author     Moritz Pflanzer <moritz.pflanzer@arm.com>  2017-09-15 10:42:58 +0100
committer  Anthony Barbier <anthony.barbier@arm.com>  2018-11-02 16:35:24 +0000
commit     80373f607cb12693824411510c39e367a4dfbdb5 (patch)
tree       ddc4d038783ed91ff227fb259a85fefc09e46319
parent     c09314a288dc2aa7ef75a09a8ff5dede3f80974a (diff)
download   ComputeLibrary-80373f607cb12693824411510c39e367a4dfbdb5.tar.gz
COMPMID-481: Add AArch32 GEMM
Change-Id: Idba0b30bfb27866a46a22388014ab81432ea28dc
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/86196
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
-rw-r--r--  SConscript  3
-rw-r--r--  arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h  44
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp  68
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/generic.hpp  344
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/merges/a32_merge_float_8x6.hpp  170
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/merges/list.hpp  4
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/transforms/a32_interleave_6way_32bit.hpp  153
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/transforms/a32_transpose_interleave_8way_32bit.hpp  127
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/transforms/list.hpp  4
-rw-r--r--  src/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.cpp  127
-rw-r--r--  src/runtime/IScheduler.cpp  8
-rw-r--r--  src/runtime/NEON/functions/NEConvolutionLayer.cpp  19
-rw-r--r--  src/runtime/NEON/functions/NEGEMM.cpp  29
-rw-r--r--  tests/networks/AlexNetNetwork.h  2
-rw-r--r--  tests/validation/fixtures/ConvolutionLayerFixture.h  4
15 files changed, 1085 insertions, 21 deletions
diff --git a/SConscript b/SConscript
index 26bcc4a6ec..15ef090289 100644
--- a/SConscript
+++ b/SConscript
@@ -169,6 +169,9 @@ if env['neon']:
core_files += Glob('src/core/NEON/*.cpp')
core_files += Glob('src/core/NEON/kernels/*.cpp')
+ if env['arch'] == "armv7a":
+ core_files += Glob('src/core/NEON/kernels/arm32/*.cpp')
+
if "arm64-v8" in env['arch']:
core_files += Glob('src/core/NEON/kernels/arm64/*.cpp')
diff --git a/arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h b/arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h
new file mode 100644
index 0000000000..597acca439
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGEMMAARCH32KERNEL_H__
+#define __ARM_COMPUTE_NEGEMMAARCH32KERNEL_H__
+
+#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** AArch32/armv7a NEON kernel to multiply two input matrices "A" and "B". */
+class NEGEMMAArch32Kernel : public NEGEMMAssemblyBaseKernel
+{
+public:
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override;
+
+protected:
+ void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1) override;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_NEGEMMAARCH32KERNEL_H__*/
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp
new file mode 100644
index 0000000000..c49633666f
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __arm__
+
+// Actual kernel implementations
+#include "a32_sgemm_8x6/generic.hpp"
+
+// 8x6 SGEMM "strategy" class.
+//
+// This describes the characteristics of a family of kernels, in terms of
+// the required interleave properties and the output block size.
+//
+// All kernels in the family must share these characteristics. The actual
+// kernel to be used can be chosen at runtime, based on the CPUInfo
+// structure passed to the constructor.
+class sgemm_8x6 {
+public:
+ typedef float operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)(const float *, const float *, float *, int, int, int);
+
+ /* Describes the data layout for A input */
+ static const int A_interleave = 6;
+ static const int A_block = 1;
+ static const int A_transpose = 0;
+
+ /* Same for B input */
+ static const int B_interleave = 8;
+ static const int B_block = 1;
+ static const int B_transpose = 1;
+
+ /* Kernel blocking parameters */
+ static const int out_width = 8;
+ static const int out_height = 6;
+ static const int k_unroll = 1;
+
+ kern_type kernel = nullptr;
+
+ sgemm_8x6(const CPUInfo *ci) {
+ kernel = a32_sgemm_8x6;
+ }
+};
+
+#endif // __arm__
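Note on usage: the constants in the strategy class above are read by the GemmInterleaved driver (gemm_interleaved.hpp, included by the kernel and functions further down in this patch) to size its panel buffers and to select the micro-kernel. A minimal sketch of that contract follows; run_strategy is a hypothetical name used for illustration, not the library's API, and the real driver additionally blocks for the cache hierarchy and handles alpha/beta merging.

    // Illustrative sketch: how a driver can consume a "strategy" class such as sgemm_8x6.
    template <typename strategy>
    void run_strategy(const CPUInfo *ci,
                      const typename strategy::operand_type *a_panel,   // A_interleave-row blocks
                      const typename strategy::operand_type *b_panel,   // B_interleave-column blocks
                      typename strategy::result_type *c_panel,          // out_width x out_height tiles
                      int ablocks, int bblocks, int K) {
        strategy strat(ci);   // the constructor picks strat.kernel for this CPU
        strat.kernel(a_panel, b_panel, c_panel, ablocks, bblocks, K);
    }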
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/generic.hpp
new file mode 100644
index 0000000000..c8cd6a33f3
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/generic.hpp
@@ -0,0 +1,344 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include "../../asmlib.hpp"
+
+#include <arm_neon.h>
+
+// Kernel implementation.
+//
+// Assume that "Apanel" points to a chunk of A blocks (each size 6xK) in read-order.
+// Assume that "Bpanel" points to a chunk of B blocks (each size 8xK) in read-order.
+// Assume that "Cpanel" points to a chunk of C output blocks (each size
+// 8x6), the chunks being arranged in a row major fashion.
+//
+// Note that the intent of this is that either ablocks or bblocks will be 1
+// - this construction allows the output loop to proceed in either order.
+
+inline void a32_sgemm_8x6(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
+ const float *a_ptr = Apanel;
+ float *c_ptr = Cpanel;
+
+ for (int yb=0; yb<ablocks; yb++) {
+ const float *a_ptr0 = a_ptr;
+ const float *b_ptr = Bpanel;
+
+ for (int xb=0; xb<bblocks; xb++) {
+ a_ptr = a_ptr0;
+ int tails = (K & 3);
+ if (tails == 0) {
+ tails = 4;
+ }
+ int k = ((K+3)/4) - 1;
+
+ __asm __volatile (
+ "vmov.i32 q4, #0\n"
+ "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
+ "vmov.i32 q5, #0\n"
+ "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
+ "vmov.i32 q6, #0\n"
+ ASM_PREFETCH("[%[a_ptr], #48]")
+ "vmov.i32 q7, #0\n"
+ ASM_PREFETCH("[%[b_ptr], #48]")
+ "vmov.i32 q8, #0\n"
+ ASM_PREFETCH("[%[a_ptr], #112]")
+ "vmov.i32 q9, #0\n"
+ ASM_PREFETCH("[%[b_ptr], #112]")
+ "vmov.i32 q10, #0\n"
+ "vmov.i32 q11, #0\n"
+ "vmov.i32 q12, #0\n"
+ "vmov.i32 q13, #0\n"
+ ASM_PREFETCH("[%[a_ptr], #176]")
+ "vmov.i32 q14, #0\n"
+ ASM_PREFETCH("[%[b_ptr], #176]")
+ "vmov.i32 q15, #0\n"
+
+ "1:\n"
+ // Unroll 0
+ "vmla.f32 q4, q2, d0[0]\n"
+ "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q5, q2, d0[1]\n"
+ "vmla.f32 q6, q2, d1[0]\n"
+ "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
+ "vmla.f32 q7, q2, d1[1]\n"
+ "vmla.f32 q8, q2, d2[0]\n"
+ "vmla.f32 q9, q2, d2[1]\n"
+ "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
+
+ "vmla.f32 q10, q3, d0[0]\n"
+ "vmla.f32 q11, q3, d0[1]\n"
+ "vmla.f32 q12, q3, d1[0]\n"
+ "vmla.f32 q13, q3, d1[1]\n"
+ "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q14, q3, d2[0]\n"
+ "vmla.f32 q15, q3, d2[1]\n"
+ "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
+
+ // Unroll 1
+ "vmla.f32 q4, q2, d3[0]\n"
+ "subs %[k], %[k], #1\n"
+ "vmla.f32 q5, q2, d3[1]\n"
+ ASM_PREFETCH("[%[a_ptr], #208]")
+ "vmla.f32 q6, q2, d0[0]\n"
+ "vmla.f32 q7, q2, d0[1]\n"
+ ASM_PREFETCH("[%[b_ptr], #192]")
+ "vmla.f32 q8, q2, d1[0]\n"
+ "vmla.f32 q9, q2, d1[1]\n"
+ "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
+
+ "vmla.f32 q10, q3, d3[0]\n"
+ "vmla.f32 q11, q3, d3[1]\n"
+ "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q12, q3, d0[0]\n"
+ "vmla.f32 q13, q3, d0[1]\n"
+ "vmla.f32 q14, q3, d1[0]\n"
+ "vmla.f32 q15, q3, d1[1]\n"
+ "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
+
+ // Unroll 2
+ "vmla.f32 q4, q2, d2[0]\n"
+ "vmla.f32 q5, q2, d2[1]\n"
+ "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
+ "vmla.f32 q6, q2, d3[0]\n"
+ "vmla.f32 q7, q2, d3[1]\n"
+ ASM_PREFETCH("[%[a_ptr], #240]")
+ "vmla.f32 q8, q2, d0[0]\n"
+ "vmla.f32 q9, q2, d0[1]\n"
+ "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
+
+ "vmla.f32 q10, q3, d2[0]\n"
+ "vmla.f32 q11, q3, d2[1]\n"
+ ASM_PREFETCH("[%[b_ptr], #208]")
+ "vmla.f32 q12, q3, d3[0]\n"
+ "vmla.f32 q13, q3, d3[1]\n"
+ "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q14, q3, d0[0]\n"
+ "vmla.f32 q15, q3, d0[1]\n"
+ "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
+
+ // Unroll 3
+ "vmla.f32 q4, q2, d1[0]\n"
+ "vmla.f32 q5, q2, d1[1]\n"
+ "vmla.f32 q6, q2, d2[0]\n"
+ "vmla.f32 q7, q2, d2[1]\n"
+ "vmla.f32 q8, q2, d3[0]\n"
+ "vmla.f32 q9, q2, d3[1]\n"
+ "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
+
+ "vmla.f32 q10, q3, d1[0]\n"
+ "vmla.f32 q11, q3, d1[1]\n"
+ "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q12, q3, d2[0]\n"
+ "vmla.f32 q13, q3, d2[1]\n"
+ "vmla.f32 q14, q3, d3[0]\n"
+ "vmla.f32 q15, q3, d3[1]\n"
+ "bne 1b\n"
+
+ // "Tails" shows how many multiply blocks are needed at the
+ // end, must be 1-4 inclusive. Bail out to alternative tail
+ // immediately if it's 1.
+ "subs %[tails], %[tails], #1\n"
+ "beq 3f\n"
+
+ // Detached final iteration
+ // Unroll 0
+ "vmla.f32 q4, q2, d0[0]\n"
+ "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q5, q2, d0[1]\n"
+ "vmla.f32 q6, q2, d1[0]\n"
+ "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
+ "vmla.f32 q7, q2, d1[1]\n"
+ "vmla.f32 q8, q2, d2[0]\n"
+ "subs %[tails], %[tails], #1\n"
+ "vmla.f32 q9, q2, d2[1]\n"
+ "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
+
+ "vmla.f32 q10, q3, d0[0]\n"
+ "vmla.f32 q11, q3, d0[1]\n"
+ "vmla.f32 q12, q3, d1[0]\n"
+ "vmla.f32 q13, q3, d1[1]\n"
+ "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q14, q3, d2[0]\n"
+ "vmla.f32 q15, q3, d2[1]\n"
+ "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
+ "beq 4f\n"
+
+ // Unroll 1
+ "vmla.f32 q4, q2, d3[0]\n"
+ "vmla.f32 q5, q2, d3[1]\n"
+ "subs %[tails], %[tails], #1\n"
+ "vmla.f32 q6, q2, d0[0]\n"
+ "vmla.f32 q7, q2, d0[1]\n"
+ "vmla.f32 q8, q2, d1[0]\n"
+ "vmla.f32 q9, q2, d1[1]\n"
+ "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
+
+ "vmla.f32 q10, q3, d3[0]\n"
+ "vmla.f32 q11, q3, d3[1]\n"
+ "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q12, q3, d0[0]\n"
+ "vmla.f32 q13, q3, d0[1]\n"
+ "vmla.f32 q14, q3, d1[0]\n"
+ "vmla.f32 q15, q3, d1[1]\n"
+ "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
+ "beq 5f\n"
+
+ // Unroll 2
+ "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q4, q2, d2[0]\n"
+ "vmla.f32 q5, q2, d2[1]\n"
+ "vmla.f32 q6, q2, d3[0]\n"
+ "vmla.f32 q7, q2, d3[1]\n"
+ "vmla.f32 q8, q2, d0[0]\n"
+ "vmla.f32 q9, q2, d0[1]\n"
+ "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
+
+ "vmla.f32 q10, q3, d2[0]\n"
+ "vmla.f32 q11, q3, d2[1]\n"
+ "vmla.f32 q12, q3, d3[0]\n"
+ "vmla.f32 q13, q3, d3[1]\n"
+ "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q14, q3, d0[0]\n"
+ "vmla.f32 q15, q3, d0[1]\n"
+ "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
+
+ // Unroll 3
+ "vmla.f32 q4, q2, d1[0]\n"
+ "vmla.f32 q10, q3, d1[0]\n"
+ "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q5, q2, d1[1]\n"
+ "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q11, q3, d1[1]\n"
+ "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q6, q2, d2[0]\n"
+ "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q12, q3, d2[0]\n"
+ "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q7, q2, d2[1]\n"
+ "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q13, q3, d2[1]\n"
+ "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q8, q2, d3[0]\n"
+ "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q14, q3, d3[0]\n"
+ "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q9, q2, d3[1]\n"
+ "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q15, q3, d3[1]\n"
+ "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
+ "b 2f\n"
+
+ // tails==1 final tail
+ "3:\n"
+ "vmla.f32 q4, q2, d0[0]\n"
+ "vld1.32 {d2}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q5, q2, d0[1]\n"
+ "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
+ "vmla.f32 q6, q2, d1[0]\n"
+ "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q10, q3, d0[0]\n"
+ "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q11, q3, d0[1]\n"
+ "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q12, q3, d1[0]\n"
+ "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q7, q2, d1[1]\n"
+ "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q13, q3, d1[1]\n"
+ "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q8, q2, d2[0]\n"
+ "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q14, q3, d2[0]\n"
+ "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q9, q2, d2[1]\n"
+ "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q15, q3, d2[1]\n"
+ "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
+ "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
+ "b 2f\n"
+
+ // tails==2 final tail
+ "4:\n"
+ "vmla.f32 q4, q2, d3[0]\n"
+ "vmla.f32 q10, q3, d3[0]\n"
+ "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q5, q2, d3[1]\n"
+ "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q11, q3, d3[1]\n"
+ "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q6, q2, d0[0]\n"
+ "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q12, q3, d0[0]\n"
+ "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q7, q2, d0[1]\n"
+ "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q13, q3, d0[1]\n"
+ "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q8, q2, d1[0]\n"
+ "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q14, q3, d1[0]\n"
+ "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q9, q2, d1[1]\n"
+ "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q15, q3, d1[1]\n"
+ "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
+ "b 2f\n"
+
+ // tails==3 final tail
+ "5:\n"
+ "vmla.f32 q4, q2, d2[0]\n"
+ "vld1.32 {d0}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q5, q2, d2[1]\n"
+ "vmla.f32 q6, q2, d3[0]\n"
+ "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q10, q3, d2[0]\n"
+ "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q11, q3, d2[1]\n"
+ "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q12, q3, d3[0]\n"
+ "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q7, q2, d3[1]\n"
+ "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q13, q3, d3[1]\n"
+ "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q8, q2, d0[0]\n"
+ "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q14, q3, d0[0]\n"
+ "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q9, q2, d0[1]\n"
+ "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q15, q3, d0[1]\n"
+ "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
+ "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
+
+ "2:\n"
+ "vst1.32 {d30-d31}, [%[c_ptr] :128]!\n"
+ : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), [k] "+r" (k), [tails] "+r" (tails)
+ :
+ : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+ }
+ }
+}
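For reference, the panel layout described in the comment at the top of this file (6-row A blocks and 8-column B blocks, both K-major, producing contiguous 8x6 output tiles) corresponds to the plain C++ model below. This is a sketch only, not code from the patch; the helper name is illustrative and the 4x unrolling, prefetching and tail handling of the assembly are ignored.

    // Sketch: reference semantics of a32_sgemm_8x6.
    inline void sgemm_8x6_reference(const float *Apanel, const float *Bpanel, float *Cpanel,
                                    int ablocks, int bblocks, int K) {
        const float *a_ptr = Apanel;
        float *c_ptr = Cpanel;
        for (int yb = 0; yb < ablocks; yb++) {
            const float *b_ptr = Bpanel;
            for (int xb = 0; xb < bblocks; xb++) {
                const float *a_blk = a_ptr;   // the same A block is reused against every B block
                float acc[6][8] = {};         // accumulators, like q4-q15 in the assembly
                for (int k = 0; k < K; k++) {
                    for (int r = 0; r < 6; r++) {
                        for (int c = 0; c < 8; c++) {
                            acc[r][c] += a_blk[k * 6 + r] * b_ptr[k * 8 + c];
                        }
                    }
                }
                for (int r = 0; r < 6; r++) {     // each C tile: 6 rows of 8 floats, contiguous
                    for (int c = 0; c < 8; c++) {
                        *c_ptr++ = acc[r][c];
                    }
                }
                b_ptr += K * 8;   // next B block
            }
            a_ptr += K * 6;       // next A block
        }
    }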
diff --git a/arm_compute/core/NEON/kernels/assembly/merges/a32_merge_float_8x6.hpp b/arm_compute/core/NEON/kernels/assembly/merges/a32_merge_float_8x6.hpp
new file mode 100644
index 0000000000..ddd67e8ee2
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/merges/a32_merge_float_8x6.hpp
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __arm__
+
+#include "../asmlib.hpp"
+
+#include <arm_neon.h>
+
+template<>
+inline void MergeResults<8, 6>(float *out, const float *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const float alpha, const float beta) {
+ const float *inptr = in;
+// prefetch_6x(inptr);
+// prefetch_6x(inptr + 96);
+
+ float32x4_t av = vdupq_n_f32(alpha);
+ float32x4_t bv = vdupq_n_f32(beta);
+
+ for (int y=y0; y<ymax; y+=6) {
+ float *outptr0 = out + (y * ldout) + x0;
+ float *outptr1 = outptr0 + ldout;
+ float *outptr2 = outptr1 + ldout;
+ float *outptr3 = outptr2 + ldout;
+ float *outptr4 = outptr3 + ldout;
+ float *outptr5 = outptr4 + ldout;
+
+// prefetch_2x(outptr0);
+// prefetch_2x(outptr1);
+// prefetch_2x(outptr2);
+// prefetch_2x(outptr3);
+// prefetch_2x(outptr4);
+// prefetch_2x(outptr5);
+
+ for (int i=x0; i<xmax; i+=8) {
+ float dummyres[8];
+
+ /* Make sure we throw away results if Y isn't a multiple of 6.
+ * We do this by pointing the result pointer at a dummy buffer
+ * we later discard. */
+ if ((y+5) >= ymax) {
+ switch ((y + 5) - ymax) {
+ case 4:
+ outptr1 = dummyres;
+ case 3:
+ outptr2 = dummyres;
+ case 2:
+ outptr3 = dummyres;
+ case 1:
+ outptr4 = dummyres;
+ case 0:
+ outptr5 = dummyres;
+ default:
+ break;
+ }
+ }
+
+ /* For ragged X, manually copy over the valid results. */
+ if ((i+7) >= xmax) {
+ for (int xi=0; xi<8; xi++) {
+ if ((i+xi) < xmax) {
+ *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
+ outptr0++;
+ *outptr1 = (alpha * inptr[xi + 8]) + (*outptr1 * beta);
+ outptr1++;
+ *outptr2 = (alpha * inptr[xi + 16]) + (*outptr2 * beta);
+ outptr2++;
+ *outptr3 = (alpha * inptr[xi + 24]) + (*outptr3 * beta);
+ outptr3++;
+ *outptr4 = (alpha * inptr[xi + 32]) + (*outptr4 * beta);
+ outptr4++;
+ *outptr5 = (alpha * inptr[xi + 40]) + (*outptr5 * beta);
+ outptr5++;
+ }
+ }
+ inptr += 48;
+ } else {
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ // Rows 0-1
+ "VLD1.32 {d8-d11}, [%[outptr0]]\n"
+ "VMUL.f32 q4, q4, %q[bv]\n"
+ "VLD1.32 {d12-d15}, [%[outptr1]]\n"
+ "VMUL.f32 q5, q5, %q[bv]\n"
+ "VLD1.32 {d0-d3}, [%[inptr]]!\n"
+ "VMUL.f32 q6, q6, %q[bv]\n"
+ "VLD1.32 {d4-d7}, [%[inptr]]!\n"
+ "VMUL.f32 q7, q7, %q[bv]\n"
+
+ "VMLA.f32 q4, q0, %q[av]\n"
+ ASM_PREFETCH("[%[inptr], #352]")
+ "VMLA.f32 q5, q1, %q[av]\n"
+ "VST1.32 {d8-d11}, [%[outptr0]]!\n"
+ ASM_PREFETCH("[%[inptr], #416]")
+ "VMLA.f32 q6, q2, %q[av]\n"
+ ASM_PREFETCH("[%[inptr], #480]")
+ "VMLA.f32 q7, q3, %q[av]\n"
+ "VST1.32 {d12-d15}, [%[outptr1]]!\n"
+
+ // Rows 2-3
+ "VLD1.32 {d8-d11}, [%[outptr2]]\n"
+ "VMUL.f32 q4, q4, %q[bv]\n"
+ "VLD1.32 {d12-d15}, [%[outptr3]]\n"
+ "VMUL.f32 q5, q5, %q[bv]\n"
+ "VLD1.32 {d0-d3}, [%[inptr]]!\n"
+ "VMUL.f32 q6, q6, %q[bv]\n"
+ "VLD1.32 {d4-d7}, [%[inptr]]!\n"
+ "VMUL.f32 q7, q7, %q[bv]\n"
+
+ "VMLA.f32 q4, q0, %q[av]\n"
+ ASM_PREFETCH("[%[outptr0], #96]")
+ "VMLA.f32 q5, q1, %q[av]\n"
+ "VST1.32 {d8-d11}, [%[outptr2]]!\n"
+ ASM_PREFETCH("[%[outptr1], #96]")
+ "VMLA.f32 q6, q2, %q[av]\n"
+ ASM_PREFETCH("[%[outptr2], #96]")
+ "VMLA.f32 q7, q3, %q[av]\n"
+ "VST1.32 {d12-d15}, [%[outptr3]]!\n"
+
+ // Rows 4-5
+ "VLD1.32 {d8-d11}, [%[outptr4]]\n"
+ "VMUL.f32 q4, q4, %q[bv]\n"
+ "VLD1.32 {d12-d15}, [%[outptr5]]\n"
+ "VMUL.f32 q5, q5, %q[bv]\n"
+ "VLD1.32 {d0-d3}, [%[inptr]]!\n"
+ "VMUL.f32 q6, q6, %q[bv]\n"
+ "VLD1.32 {d4-d7}, [%[inptr]]!\n"
+ "VMUL.f32 q7, q7, %q[bv]\n"
+
+ "VMLA.f32 q4, q0, %q[av]\n"
+ ASM_PREFETCH("[%[outptr3], #96]")
+ "VMLA.f32 q5, q1, %q[av]\n"
+ "VST1.32 {d8-d11}, [%[outptr4]]!\n"
+ ASM_PREFETCH("[%[outptr4], #96]")
+ "VMLA.f32 q6, q2, %q[av]\n"
+ ASM_PREFETCH("[%[outptr5], #128]")
+ "VMLA.f32 q7, q3, %q[av]\n"
+ "VST1.32 {d12-d15}, [%[outptr5]]!\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3),
+ [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [inptr] "+r" (inptr)
+ : [av] "w" (av), [bv] "w" (bv)
+ : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"
+ );
+ }
+ }
+ }
+}
+
+#endif // __arm__
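Per element, the merge above computes out = alpha * result + beta * out, consuming the 8x6 tiles in exactly the order the kernel wrote them; the scalar path at the top handles the ragged right and bottom edges. A scalar model of one full tile, as a sketch only (the helper name is illustrative):

    // Sketch: what MergeResults<8, 6> does for one complete 8x6 tile.
    inline void merge_tile_8x6_reference(float *out, const float *in, int ldout,
                                         float alpha, float beta) {
        for (int r = 0; r < 6; r++) {
            for (int c = 0; c < 8; c++) {
                out[r * ldout + c] = alpha * in[r * 8 + c] + beta * out[r * ldout + c];
            }
        }
    }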
diff --git a/arm_compute/core/NEON/kernels/assembly/merges/list.hpp b/arm_compute/core/NEON/kernels/assembly/merges/list.hpp
index 4f23333ef1..29b915a75d 100644
--- a/arm_compute/core/NEON/kernels/assembly/merges/list.hpp
+++ b/arm_compute/core/NEON/kernels/assembly/merges/list.hpp
@@ -21,4 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
+#include "a32_merge_float_8x6.hpp"
#include "a64_merge_float_12x8.hpp"
+//#include "a64_merge_float_to_half_12x8.hpp"
+//#include "a64_merge_half_24x8.hpp"
+//#include "a64_merge_int32_12x8.hpp"
diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/a32_interleave_6way_32bit.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/a32_interleave_6way_32bit.hpp
new file mode 100644
index 0000000000..1c1f85c11c
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/transforms/a32_interleave_6way_32bit.hpp
@@ -0,0 +1,153 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __arm__
+
+#include "../asmlib.hpp"
+
+#include <arm_neon.h>
+
+template<>
+template<typename T>
+void TransformImpl<6, 1, false, 4, 4>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) {
+ uint32_t *outptr = reinterpret_cast<uint32_t *>(out);
+ const uint32_t *inptr = reinterpret_cast<const uint32_t *>(in);
+
+ uint32_t zerobuff[8] = { 0 };
+
+ for (int y=y0; y<ymax; y+=6) {
+ const uint32_t *inptr0 = inptr + y * ldin + k0;
+ const uint32_t *inptr1 = inptr0 + ldin;
+ const uint32_t *inptr2 = inptr1 + ldin;
+ const uint32_t *inptr3 = inptr2 + ldin;
+ const uint32_t *inptr4 = inptr3 + ldin;
+ const uint32_t *inptr5 = inptr4 + ldin;
+
+ //prefetch_2x(inptr0);
+ //prefetch_2x(inptr1);
+ //prefetch_2x(inptr2);
+ //prefetch_2x(inptr3);
+ //prefetch_2x(inptr4);
+ //prefetch_2x(inptr5);
+
+ int x=(kmax-k0);
+ for (;x>7;x-=8) {
+ /* Cope with ragged cases by copying from a buffer of zeroes instead */
+ if ((y + 5) >= ymax) {
+ switch ((y + 5) - ymax) {
+ /* Everything falls through in here */
+ case 4:
+ inptr1 = zerobuff;
+ case 3:
+ inptr2 = zerobuff;
+ case 2:
+ inptr3 = zerobuff;
+ case 1:
+ inptr4 = zerobuff;
+ case 0:
+ inptr5 = zerobuff;
+ default:
+ break;
+ }
+ }
+
+
+ __asm __volatile (
+ // Load up 8 elements (2 vectors) from each of 6 sources.
+ "VLD1.32 {d0-d3}, [%[inptr0]]!\n" // q0=A0A1A2A3
+ "VLD1.32 {d4-d7}, [%[inptr1]]!\n" // q2=B0B1B2B3
+ "VLD1.32 {d8-d11}, [%[inptr2]]!\n" // q4=C0C1C2C3
+ "VZIP.32 q0, q4\n" // q0=A0C0A1C1, q4 = A2C2A3C3
+ "VLD1.32 {d12-d15}, [%[inptr3]]!\n" // q6=D0D1D2D3
+ "VZIP.32 q2, q6\n" // q2=B0D0B1D1, q6 = B2D2B3D3
+ "VLD1.32 {d16-d19}, [%[inptr4]]!\n"
+ "VLD1.32 {d20-d23}, [%[inptr5]]!\n"
+ "VZIP.32 q8, q10\n" // q8=E0F0E1F1, q10 = E2F2E3F3
+ ASM_PREFETCH("[%[inptr0], #128]")
+ "VZIP.32 q0, q2\n" // q0 = A0B0C0D0, q2 = A1B1C1D1
+
+ // Store first elements
+ "VST1.32 {d0-d1}, [%[outptr]]!\n"
+ "VST1.32 {d16}, [%[outptr]]!\n"
+
+ "VZIP.32 q4, q6\n" // q4 = A2B2C2D2, q6 = A3B3C3D3
+
+ // Store second elements
+ "VST1.32 {d4-d5}, [%[outptr]]!\n"
+ "VZIP.32 q1, q5\n"
+ ASM_PREFETCH("[%[inptr1], #128]")
+ "VST1.32 {d17}, [%[outptr]]!\n"
+ "VZIP.32 q3, q7\n"
+
+ // Store third elements
+ "VZIP.32 q9, q11\n"
+ "VST1.32 {d8-d9}, [%[outptr]]!\n"
+ "VZIP.32 q1, q3\n"
+ ASM_PREFETCH("[%[inptr2], #128]")
+ "VST1.32 {d20}, [%[outptr]]!\n"
+
+ // Store fourth elements
+ "VZIP.32 q5, q7\n"
+ "VST1.32 {d12-d13}, [%[outptr]]!\n"
+ ASM_PREFETCH("[%[inptr3], #128]")
+ "VST1.32 {d21}, [%[outptr]]!\n"
+
+ // Fifth
+ "VST1.32 {d2-d3}, [%[outptr]]!\n"
+ ASM_PREFETCH("[%[inptr4], #128]")
+ "VST1.32 {d18}, [%[outptr]]!\n"
+
+ // Sixth
+ "VST1.32 {d6-d7}, [%[outptr]]!\n"
+ ASM_PREFETCH("[%[inptr5], #128]")
+ "VST1.32 {d19}, [%[outptr]]!\n"
+
+ // Seventh
+ "VST1.32 {d10-d11}, [%[outptr]]!\n"
+ "VST1.32 {d22}, [%[outptr]]!\n"
+
+ // Eighth
+ "VST1.32 {d14-d15}, [%[outptr]]!\n"
+ "VST1.32 {d23}, [%[outptr]]!\n"
+
+ : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3),
+ [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [outptr] "+r" (outptr)
+ :
+ : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12"
+ );
+ }
+
+ for (;x>0;x--) {
+ *outptr++ = *inptr0++;
+ *outptr++ = *inptr1++;
+ *outptr++ = *inptr2++;
+ *outptr++ = *inptr3++;
+ *outptr++ = *inptr4++;
+ *outptr++ = *inptr5++;
+ }
+ }
+}
+
+#endif // __arm__
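In effect this transform produces the A-panel layout assumed by sgemm_8x6 (A_interleave = 6, A_block = 1): for each k it emits one element from each of six consecutive rows, with ragged rows read as zero. A scalar model of the intended semantics, as a sketch only (the helper name is illustrative):

    #include <cstdint>

    // Sketch: intended semantics of the 6-way 32-bit interleave.
    inline void interleave_6way_reference(uint32_t *out, const uint32_t *in, int ldin,
                                          int y0, int ymax, int k0, int kmax) {
        for (int y = y0; y < ymax; y += 6) {
            for (int k = k0; k < kmax; k++) {
                for (int r = 0; r < 6; r++) {
                    const int row = y + r;
                    *out++ = (row < ymax) ? in[row * ldin + k] : 0u;   // ragged rows read as zero
                }
            }
        }
    }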
diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/a32_transpose_interleave_8way_32bit.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/a32_transpose_interleave_8way_32bit.hpp
new file mode 100644
index 0000000000..a5a5a1058f
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/transforms/a32_transpose_interleave_8way_32bit.hpp
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __arm__
+
+#include "transpose_interleave_common.hpp"
+
+// Generic unblocked transposed 8x32-bit sized specialisation
+template <>
+template <typename T>
+inline void TransformImpl<8, 1, true, 4, 4>::Transform(
+ T* out, const T* const in, const int stride,
+ const int x0, const int xmax, const int k0, const int kmax
+) {
+ // Redirect to a 16x uint16_t specialisation
+ TransformImpl<16, 1, true, 2, 2>::Transform(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t * const>(in),
+ stride*2, x0*2, xmax*2, k0, kmax
+ );
+}
+
+// Generic 16x16-bit sized specialisation
+template <>
+template <typename T>
+inline void TransformImpl<16, 1, true, 2, 2>::Transform(
+ T* out, const T* const in, const int stride,
+ const int x0, const int xmax, const int k0, const int kmax
+) {
+ // Redirect to a uint16_t specialisation
+ Transform(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t * const>(in),
+ stride, x0, xmax, k0, kmax
+ );
+}
+
+// Specialised 16 x uint16_t version
+template <>
+inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out) {
+ __asm volatile (
+ "VLD1.32 {d0-d3}, [%[in0]]!\n"
+ "VST1.32 {d0-d3}, [%[out]]\n"
+ ASM_PREFETCH("[%[in0], #192]")
+ : [in0] "+r" (in0),
+ [out] "+r" (out)
+ :
+ : "q0", "q1", "memory"
+ );
+}
+
+template <>
+inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *out) {
+ __asm volatile (
+ "VLD1.32 {d0-d3}, [%[in0]]!\n"
+ "VST1.32 {d0-d3}, [%[out]]!\n"
+ ASM_PREFETCH("[%[in0], #192]")
+ "VLD1.32 {d0-d3}, [%[in1]]!\n"
+ "VST1.32 {d0-d3}, [%[out]]\n"
+ ASM_PREFETCH("[%[in1], #192]")
+ "SUB %[out], %[out], #32\n"
+ : [in0] "+r" (in0),
+ [in1] "+r" (in1),
+ [out] "+r" (out)
+ :
+ : "q0", "q1", "memory"
+ );
+}
+
+template <>
+inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out) {
+ __asm __volatile (
+ "VLD1.32 {d0-d3}, [%[in0]]!\n"
+ "VST1.32 {d0-d3}, [%[out]]!\n"
+ ASM_PREFETCH("[%[in0], #192]")
+ "VLD1.32 {d0-d3}, [%[in1]]!\n"
+ "VST1.32 {d0-d3}, [%[out]]!\n"
+ ASM_PREFETCH("[%[in1], #192]")
+ "VLD1.32 {d0-d3}, [%[in2]]!\n"
+ "VST1.32 {d0-d3}, [%[out]]!\n"
+ ASM_PREFETCH("[%[in2], #192]")
+ "VLD1.32 {d0-d3}, [%[in3]]!\n"
+ "VST1.32 {d0-d3}, [%[out]]\n"
+ ASM_PREFETCH("[%[in3], #192]")
+ "SUB %[out], %[out], #96\n"
+ : [in0] "+r" (in0),
+ [in1] "+r" (in1),
+ [in2] "+r" (in2),
+ [in3] "+r" (in3),
+ [out] "+r" (out)
+ :
+ : "q0", "q1", "memory"
+ );
+}
+
+template <>
+template <>
+inline void TransformImpl<16, 1, true, 2, 2>::Transform(
+ uint16_t* out, const uint16_t* const in, const int stride,
+ const int x0, const int xmax, const int k0, const int kmax
+) {
+ TransposeInterleaveCommon<16, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax);
+}
+
+#endif // __arm__
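The two redirections above work because the transform only moves bytes: eight 32-bit elements per row occupy the same bytes as sixteen 16-bit elements, so the 32-bit case doubles the stride and x bounds and reuses the 16-bit specialisation. The result is the B-panel layout the kernel expects (B_interleave = 8, transposed): 8-column blocks written K-major. A scalar model, as a sketch only, with an illustrative helper name and ragged-edge handling omitted:

    // Sketch: intended output layout of the 8-way transposed interleave for 32-bit data.
    inline void transpose_interleave_8way_reference(float *out, const float *in, int ldin,
                                                    int x0, int xmax, int k0, int kmax) {
        for (int x = x0; x < xmax; x += 8) {          // one 8-column block at a time
            for (int k = k0; k < kmax; k++) {
                for (int c = 0; c < 8; c++) {
                    *out++ = in[k * ldin + (x + c)];
                }
            }
        }
    }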
diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/list.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/list.hpp
index 3cf6b41ffa..13e1b5468b 100644
--- a/arm_compute/core/NEON/kernels/assembly/transforms/list.hpp
+++ b/arm_compute/core/NEON/kernels/assembly/transforms/list.hpp
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-//#include "a32_interleave_6way_32bit.hpp"
-//#include "a32_transpose_interleave_8way_32bit.hpp"
+#include "a32_interleave_6way_32bit.hpp"
+#include "a32_transpose_interleave_8way_32bit.hpp"
//#include "a64_interleave_8way_16bit.hpp"
#include "a64_interleave_8way_32bit.hpp"
//#include "a64_interleave_8way_half_to_float.hpp"
diff --git a/src/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.cpp b/src/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.cpp
new file mode 100644
index 0000000000..ad0743b50f
--- /dev/null
+++ b/src/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.cpp
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/AccessWindowTranspose.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp"
+} // namespace arm_compute
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+
+namespace arm_compute
+{
+void NEGEMMAArch32Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
+
+ _input0 = input0;
+ _input1 = input1;
+ _output = output;
+ _workspace = workspace;
+ _alpha = alpha;
+ _beta = beta;
+ _transform_0 = transform_0;
+ _transform_1 = transform_1;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info());
+
+ AccessWindowRectangle output_access(output->info(), 0, 0, 8, 6);
+
+ const int input0_access_end = ceil_to_multiple(input0->info()->tensor_shape().x(), 6);
+ const int input1_access_end = ceil_to_multiple(input1->info()->tensor_shape().x(), 8);
+
+ update_window_and_padding(win,
+ AccessWindowStatic(input0->info(), 0, 0, input0_access_end, input0->info()->tensor_shape().y()),
+ AccessWindowStatic(input1->info(), 0, 0, input1_access_end, input1->info()->tensor_shape().y()),
+ output_access);
+
+ INEKernel::configure(win);
+}
+
+void NEGEMMAArch32Kernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ const int lda = _input0->info()->strides_in_bytes().y() / sizeof(float);
+ const int ldb = _input1->info()->strides_in_bytes().y() / sizeof(float);
+ const int ldc = _output->info()->strides_in_bytes().y() / sizeof(float);
+
+ const auto in1_ptr = reinterpret_cast<const float *>(_input1->buffer());
+
+ const int M = std::min(_output->info()->tensor_shape().y(), static_cast<size_t>(window.y().end())) - window.y().start();
+ const int N = _output->info()->tensor_shape().x();
+ const int K = _input0->info()->tensor_shape().x();
+
+ // Only iterate over batches
+ Window win(window);
+ win.set(0, Window::Dimension(0, 1, 1));
+ win.set(1, Window::Dimension(0, 1, 1));
+
+ Iterator in0(_input0, window);
+ Iterator out(_output, window);
+
+ GemmInterleaved<sgemm_8x6, float, float> gemm(&info.cpu_info, M, N, K, !_transform_0, !_transform_1);
+ constexpr size_t alignment = 4096;
+ const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id;
+ void *workspace = _workspace->buffer() + offset;
+ size_t workspace_size = _workspace->info()->total_size();
+
+ if(support::cpp11::align(alignment, gemm.get_working_size(), workspace, workspace_size) == nullptr)
+ {
+ ARM_COMPUTE_ERROR("Not enough space to align buffer!");
+ }
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ gemm.execute(reinterpret_cast<const float *>(in0.ptr()), lda,
+ reinterpret_cast<const float *>(in1_ptr), ldb,
+ reinterpret_cast<float *>(out.ptr()), ldc,
+ _alpha, _beta, workspace);
+ },
+ in0, out);
+}
+} // namespace arm_compute
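The workspace arithmetic in run() above pairs with the allocation performed later in NEGEMM::configure() and NEConvolutionLayer::configure(): a single U8 tensor of (get_working_size() + alignment - 1) * num_threads bytes, from which each thread takes its own 4096-byte-aligned slice. A compact sketch of that slicing follows; the function and parameter names are assumptions for illustration, with "working_size" standing in for gemm.get_working_size().

    #include <cstddef>
    #include <cstdint>
    #include <memory>

    // Sketch: per-thread workspace slicing as done above.
    // Returns nullptr if the slice cannot be aligned within the buffer.
    inline void *thread_workspace(uint8_t *buffer, size_t buffer_size,
                                  size_t working_size, size_t alignment,
                                  unsigned int thread_id) {
        const size_t offset = (working_size + alignment - 1) * thread_id;
        if (offset > buffer_size) {
            return nullptr;
        }
        void  *ptr   = buffer + offset;
        size_t space = buffer_size - offset;
        return std::align(alignment, working_size, ptr, space);
    }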
diff --git a/src/runtime/IScheduler.cpp b/src/runtime/IScheduler.cpp
index 1745764bbb..4292469d14 100644
--- a/src/runtime/IScheduler.cpp
+++ b/src/runtime/IScheduler.cpp
@@ -135,11 +135,13 @@ IScheduler::IScheduler()
_info.CPU = CPUTarget::A53;
break;
default:
-#ifdef __aarch64__
+#ifdef __arm__
+ _info.CPU = CPUTarget::ARMV7;
+#elif __aarch64__
_info.CPU = CPUTarget::ARMV8;
-#else /* __aarch64__ */
+#else /* __arm__ || __aarch64__ */
_info.CPU = CPUTarget::INTRINSICS;
-#endif /* __aarch64__ */
+#endif /* __arm__ || __aarch64__ */
break;
}
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index 44bf2de70c..cbe3b65c34 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
+#include "arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h"
#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Size2D.h"
@@ -34,6 +35,7 @@
namespace arm_compute
{
#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp"
#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp"
} // namespace arm_compute
@@ -151,12 +153,17 @@ void NEConvolutionLayer::configure(const ITensor *input, const ITensor *weights,
// Check if its a "fully connected" convolution, i.e. the output size is 1x1xnum_kernels
_is_fully_connected_convolution = ((conv_w == 1) && (conv_h == 1));
-#if defined(__aarch64__)
+#if defined(__arm__)
+ if(NEScheduler::get().cpu_info().CPU == CPUTarget::ARMV7 && dt == DataType::F32)
+ {
+ _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch32Kernel>();
+ }
+#elif defined(__aarch64__)
if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && dt == DataType::F32)
{
_mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch64Kernel>();
}
-#endif /* defined(__aarch64__) */
+#endif /* defined(__arm__) || defined(__aarch64__) */
unsigned int mat_weights_cols = weights->info()->dimension(3);
unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + (_has_bias ? 1 : 0);
@@ -240,7 +247,7 @@ void NEConvolutionLayer::configure(const ITensor *input, const ITensor *weights,
// Configure kernels
_input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _has_bias);
-#if defined(__aarch64__)
+#if defined(__arm__) || defined(__aarch64__)
if(_mm_optimised_kernel != nullptr)
{
struct CPUInfo ci = NEScheduler::get().cpu_info();
@@ -249,7 +256,11 @@ void NEConvolutionLayer::configure(const ITensor *input, const ITensor *weights,
const int N = _gemm_output.info()->tensor_shape().x();
const int K = _input_im2col_reshaped.info()->tensor_shape().x();
+#if defined(__arm__)
+ GemmInterleaved<sgemm_8x6, float, float> gemm(&ci, M, N, K, false, false);
+#elif defined(__aarch64__)
GemmInterleaved<sgemm_12x8, float, float> gemm(&ci, M, N, K, false, false);
+#endif /* defined(__arm__) || defined(__aarch64__) */
constexpr size_t alignment = 4096;
_workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
@@ -268,7 +279,7 @@ void NEConvolutionLayer::configure(const ITensor *input, const ITensor *weights,
_workspace.allocator()->allocate();
}
else
-#endif /* defined(__aarch64__) */
+#endif /* defined(__arm__) || defined(__aarch64__) */
{
if(_is_fully_connected_convolution)
{
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index 1d6aa65e37..ff92ef8351 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h"
#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
@@ -37,6 +38,7 @@
namespace arm_compute
{
#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp"
#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp"
} // namespace arm_compute
@@ -68,13 +70,6 @@ void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITe
_run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
-#if defined(__aarch64__)
- if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f))
- {
- _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch64Kernel>();
- }
-#endif /* defined(__aarch64__) */
-
// Check if the first input tensor is a vector.
// If so, all the kernels for reshaping the tensors can be skipped
if(_run_vector_matrix_multiplication)
@@ -91,7 +86,19 @@ void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITe
}
else
{
-#if defined(__aarch64__)
+#if defined(__arm__)
+ if(NEScheduler::get().cpu_info().CPU == CPUTarget::ARMV7 && a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f))
+ {
+ _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch32Kernel>();
+ }
+#elif defined(__aarch64__)
+ if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f))
+ {
+ _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch64Kernel>();
+ }
+#endif /* defined(__arm__) || defined(__aarch64__) */
+
+#if defined(__arm__) || defined(__aarch64__)
if(_mm_optimised_kernel != nullptr)
{
struct CPUInfo ci = NEScheduler::get().cpu_info();
@@ -100,7 +107,11 @@ void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITe
const int N = d->info()->tensor_shape().x();
const int K = a->info()->tensor_shape().x();
+#if defined(__arm__)
+ GemmInterleaved<sgemm_8x6, float, float> gemm(&ci, M, N, K, false, false);
+#elif defined(__aarch64__)
GemmInterleaved<sgemm_12x8, float, float> gemm(&ci, M, N, K, false, false);
+#endif /* defined(__arm__) || defined(__aarch64__) */
constexpr size_t alignment = 4096;
_workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
@@ -112,7 +123,7 @@ void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITe
_workspace.allocator()->allocate();
}
else
-#endif /* defined(__aarch64__) */
+#endif /* defined(__arm__) || defined(__aarch64__) */
{
TensorShape shape_tmp_a = a->info()->tensor_shape();
TensorShape shape_tmp_b = b->info()->tensor_shape();
diff --git a/tests/networks/AlexNetNetwork.h b/tests/networks/AlexNetNetwork.h
index 0c06c1860f..448cf31914 100644
--- a/tests/networks/AlexNetNetwork.h
+++ b/tests/networks/AlexNetNetwork.h
@@ -100,7 +100,7 @@ public:
{
auto reshape = [&](unsigned int width, unsigned int height, bool convolution_layer) -> TensorShape
{
- const bool is_optimised = std::is_same<ITensorType, ITensor>::value && NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && data_type == DataType::F32;
+ const bool is_optimised = std::is_same<ITensorType, ITensor>::value && NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV7 && data_type == DataType::F32;
if(convolution_layer && is_optimised)
{
diff --git a/tests/validation/fixtures/ConvolutionLayerFixture.h b/tests/validation/fixtures/ConvolutionLayerFixture.h
index fcaf4ef42b..434291b58e 100644
--- a/tests/validation/fixtures/ConvolutionLayerFixture.h
+++ b/tests/validation/fixtures/ConvolutionLayerFixture.h
@@ -88,7 +88,7 @@ protected:
{
// Check if its a "fully connected" convolution
const bool is_fully_connected_convolution = (output_shape.x() == 1 && output_shape.y() == 1);
- const bool is_optimised = std::is_same<FunctionType, NEConvolutionLayer>::value && NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && data_type == DataType::F32;
+ const bool is_optimised = std::is_same<FunctionType, NEConvolutionLayer>::value && NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV7 && data_type == DataType::F32;
reshaped_weights_shape.collapse(3);
@@ -143,7 +143,7 @@ protected:
if(!reshape_weights)
{
const bool is_fully_connected_convolution = (output_shape.x() == 1 && output_shape.y() == 1);
- const bool is_optimised = std::is_same<FunctionType, NEConvolutionLayer>::value && NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && data_type == DataType::F32;
+ const bool is_optimised = std::is_same<FunctionType, NEConvolutionLayer>::value && NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV7 && data_type == DataType::F32;
TensorShape tmp_weights_shape(weights_shape);
SimpleTensor<T> tmp_weights(tmp_weights_shape, data_type, 1, fixed_point_position);