-rw-r--r--  SConscript | 3
-rw-r--r--  arm_compute/core/CPP/CPPTypes.h | 13
-rw-r--r--  arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h | 85
-rw-r--r--  arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h | 44
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/asmlib.hpp | 121
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/gemm_common.hpp | 33
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp | 176
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp | 72
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a53.hpp | 367
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/generic.hpp | 358
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/mergeresults.hpp | 59
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/merges/a64_merge_float_12x8.hpp | 236
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/merges/list.hpp | 24
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/profiler.hpp | 97
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/transform.hpp | 110
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_32bit.hpp | 174
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/transforms/list.hpp | 32
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/transforms/transpose_interleave_common.hpp | 139
-rw-r--r--  arm_compute/runtime/IScheduler.h | 26
-rw-r--r--  arm_compute/runtime/NEON/functions/NEConvolutionLayer.h | 34
-rw-r--r--  arm_compute/runtime/NEON/functions/NEGEMM.h | 22
-rwxr-xr-x  scripts/add_copyright.py | 2
-rwxr-xr-x  scripts/check_bad_style.sh | 16
-rwxr-xr-x  scripts/clang_tidy_rules.py | 6
-rwxr-xr-x  scripts/fix_code_formatting.sh | 5
-rw-r--r--  src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp | 16
-rw-r--r--  src/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.cpp | 127
-rw-r--r--  src/runtime/CPP/CPPScheduler.cpp | 2
-rw-r--r--  src/runtime/CPP/SingleThreadScheduler.cpp | 7
-rw-r--r--  src/runtime/IScheduler.cpp | 159
-rw-r--r--  src/runtime/NEON/functions/NEConvolutionLayer.cpp | 150
-rw-r--r--  src/runtime/NEON/functions/NEGEMM.cpp | 152
-rw-r--r--  support/ToolchainSupport.h | 17
-rw-r--r--  tests/networks/AlexNetNetwork.h | 158
-rw-r--r--  tests/validation/fixtures/ConvolutionLayerFixture.h | 10
35 files changed, 2827 insertions(+), 225 deletions(-)
diff --git a/SConscript b/SConscript
index 00c366e890..26bcc4a6ec 100644
--- a/SConscript
+++ b/SConscript
@@ -169,6 +169,9 @@ if env['neon']:
core_files += Glob('src/core/NEON/*.cpp')
core_files += Glob('src/core/NEON/kernels/*.cpp')
+ if "arm64-v8" in env['arch']:
+ core_files += Glob('src/core/NEON/kernels/arm64/*.cpp')
+
runtime_files += Glob('src/runtime/NEON/*.cpp')
runtime_files += Glob('src/runtime/NEON/functions/*.cpp')
diff --git a/arm_compute/core/CPP/CPPTypes.h b/arm_compute/core/CPP/CPPTypes.h
index adad00f8c4..cff49db0ac 100644
--- a/arm_compute/core/CPP/CPPTypes.h
+++ b/arm_compute/core/CPP/CPPTypes.h
@@ -48,11 +48,18 @@ enum class CPUTarget
A75_DOT = (A75 | DOT),
};
+struct CPUInfo
+{
+ CPUTarget CPU{ CPUTarget::INTRINSICS };
+ int L1_size{ 0 };
+ int L2_size{ 0 };
+};
+
struct ThreadInfo
{
- int thread_id{ 0 };
- int num_threads{ 1 };
- CPUTarget cpu{ CPUTarget::INTRINSICS };
+ int thread_id{ 0 };
+ int num_threads{ 1 };
+ CPUInfo cpu_info{};
};
} // namespace arm_compute
#endif /* __ARM_COMPUTE_CPP_TYPES_H__ */
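The hunk above replaces the bare CPUTarget field of ThreadInfo with a CPUInfo block that also carries cache sizes. As a minimal sketch (not part of the patch) of how a caller might populate it — the A53 target and the cache sizes below are assumed values:

    // Hypothetical example: describe the CPU a worker thread runs on so that
    // kernels can pick a tuned code path through ThreadInfo::cpu_info.
    arm_compute::CPUInfo cpu{};
    cpu.CPU     = arm_compute::CPUTarget::A53; // illustrative target
    cpu.L1_size = 32 * 1024;                   // assumed cache sizes
    cpu.L2_size = 512 * 1024;

    arm_compute::ThreadInfo info{};
    info.thread_id   = 0;
    info.num_threads = 1;
    info.cpu_info    = cpu;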
diff --git a/arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h b/arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h
new file mode 100644
index 0000000000..e298bfdebd
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGEMMASSEMBLYBASE_H__
+#define __ARM_COMPUTE_NEGEMMASSEMBLYBASE_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Base class for assembly NEON kernels which multiply two input matrices "A" and "B". */
+class NEGEMMAssemblyBaseKernel : public INEKernel
+{
+public:
+ /** Constructor */
+ NEGEMMAssemblyBaseKernel()
+ : _input0(nullptr), _input1(nullptr), _output(nullptr), _workspace(nullptr), _alpha(1.f), _beta(0.f), _transform_0(true), _transform_1(true)
+ {
+ }
+
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEGEMMAssemblyBaseKernel(const NEGEMMAssemblyBaseKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEGEMMAssemblyBaseKernel &operator=(const NEGEMMAssemblyBaseKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEGEMMAssemblyBaseKernel(NEGEMMAssemblyBaseKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEGEMMAssemblyBaseKernel &operator=(NEGEMMAssemblyBaseKernel &&) = default;
+
+ virtual ~NEGEMMAssemblyBaseKernel() = default;
+
+ /** Initialise the kernel's input and output.
+ *
+ * The computed function is C = alpha * A * B + beta * C.
+ *
+ * @param[in] input0 Input tensor containing the Matrix A. Data types supported: F32
+ * @param[in] input1 Input tensor containing the Matrix B. Data types supported: same as @p input0
+ * @param[in,out] output Output tensor to store the result of matrix multiplication. If @p beta is not zero the values are multiplied by @p beta before the result is accumulated. Otherwise the values are overwritten by the result. Data types supported: same as @p input0.
+ * @param[out] workspace Space for intermediate results.
+ * @param[in] alpha Weight of the matrix product
+ * @param[in] beta Weight of the accumulation.
+ * @param[in] transform_0 If true the kernel will transform @p input0 prior to the multiplication.
+ * @param[in] transform_1 If true the kernel will transform @p input1 prior to the multiplication.
+ */
+ void configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha = 1.f, float beta = 0.f, bool transform_0 = true, bool transform_1 = true)
+ {
+ internal_configure(input0, input1, output, workspace, alpha, beta, transform_0, transform_1);
+ }
+
+protected:
+ virtual void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1) = 0;
+
+ const ITensor *_input0;
+ const ITensor *_input1;
+ ITensor *_output;
+ ITensor *_workspace;
+ float _alpha;
+ float _beta;
+ bool _transform_0;
+ bool _transform_1;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_NEGEMMASSEMBLYBASE_H__*/
diff --git a/arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h b/arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h
new file mode 100644
index 0000000000..77431d2bc8
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGEMMAARCH64KERNEL_H__
+#define __ARM_COMPUTE_NEGEMMAARCH64KERNEL_H__
+
+#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** AArch64 NEON kernel to multiply two input matrices "A" and "B". */
+class NEGEMMAArch64Kernel : public NEGEMMAssemblyBaseKernel
+{
+public:
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override;
+
+protected:
+ void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1) override;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_NEGEMMAARCH64KERNEL_H__*/
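A sketch of how the two new kernels are meant to be set up through the non-virtual configure() wrapper of the base class; the helper function and tensor names are hypothetical, and the tensors are assumed to be already-allocated F32 tensors of compatible shapes:

    #include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h"

    // Hypothetical set-up of the AArch64 assembly GEMM kernel.
    void configure_assembly_gemm(const arm_compute::ITensor *a, const arm_compute::ITensor *b,
                                 arm_compute::ITensor *c, arm_compute::ITensor *workspace)
    {
        arm_compute::NEGEMMAArch64Kernel kernel;
        // configure() simply forwards its arguments to the derived class's
        // internal_configure() override (alpha = 1, beta = 0 here).
        kernel.configure(a, b, c, workspace, 1.0f, 0.0f);
    }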
diff --git a/arm_compute/core/NEON/kernels/assembly/asmlib.hpp b/arm_compute/core/NEON/kernels/assembly/asmlib.hpp
new file mode 100644
index 0000000000..fa1d6e37a9
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/asmlib.hpp
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+// Macro to use in assembler to get a preload. Needed because of various
+// workarounds needed to get working preload behaviour.
+//
+// Code using these macros needs to clobber x20 and x21 as they might be
+// used by the workaround.
+
+#define ASM_PREFETCH(address) "PRFM PLDL1KEEP, " address "\n"
+#define ASM_PREFETCHL2(address) "PRFM PLDL2KEEP, " address "\n"
+#define ASM_PREFETCHW(address) "PRFM PSTL1KEEP, " address "\n"
+#define ASM_PREFETCHWL2(address) "PRFM PSTL2KEEP, " address "\n"
+
+#else
+
+#define ASM_PREFETCH(address) "PLD " address "\n"
+#define ASM_PREFETCHW(address) "PLDW " address "\n"
+
+#endif
+
+/*
+ * Do some prefetches.
+ */
+template <typename T>
+static inline void prefetch_6x(const T *pfp) {
+ __asm __volatile (
+ ASM_PREFETCH("[%[pfp]]")
+ ASM_PREFETCH("[%[pfp], #64]")
+ ASM_PREFETCH("[%[pfp], #128]")
+ ASM_PREFETCH("[%[pfp], #192]")
+ ASM_PREFETCH("[%[pfp], #256]")
+ ASM_PREFETCH("[%[pfp], #320]")
+ :
+ : [pfp] "r" (pfp)
+ : "memory"
+ );
+}
+
+template <typename T>
+static inline void prefetch_5x(const T *pfp) {
+ __asm __volatile (
+ ASM_PREFETCH("[%[pfp]]")
+ ASM_PREFETCH("[%[pfp], #64]")
+ ASM_PREFETCH("[%[pfp], #128]")
+ ASM_PREFETCH("[%[pfp], #192]")
+ ASM_PREFETCH("[%[pfp], #256]")
+ :
+ : [pfp] "r" (pfp)
+ : "memory"
+ );
+}
+
+template <typename T>
+static inline void prefetch_4x(const T *pfp) {
+ __asm __volatile (
+ ASM_PREFETCH("[%[pfp]]")
+ ASM_PREFETCH("[%[pfp], #64]")
+ ASM_PREFETCH("[%[pfp], #128]")
+ ASM_PREFETCH("[%[pfp], #192]")
+ :
+ : [pfp] "r" (pfp)
+ : "memory"
+ );
+}
+
+template <typename T>
+static inline void prefetch_3x(const T *pfp) {
+ __asm __volatile (
+ ASM_PREFETCH("[%[pfp]]")
+ ASM_PREFETCH("[%[pfp], #64]")
+ ASM_PREFETCH("[%[pfp], #128]")
+ :
+ : [pfp] "r" (pfp)
+ : "memory"
+ );
+}
+
+template <typename T>
+static inline void prefetch_2x(const T *pfp) {
+ __asm __volatile (
+ ASM_PREFETCH("[%[pfp]]")
+ ASM_PREFETCH("[%[pfp], #64]")
+ :
+ : [pfp] "r" (pfp)
+ : "memory"
+ );
+}
+
+template <typename T>
+static inline void prefetch_1x(const T *pfp) {
+ __asm __volatile (
+ ASM_PREFETCH("[%[pfp]]")
+ :
+ : [pfp] "r" (pfp)
+ : "memory"
+ );
+}
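A small usage sketch for the helpers above (the buffer is illustrative); each prefetch_Nx() issues N ASM_PREFETCH operations at consecutive 64-byte offsets from the given address:

    // Hypothetical example: warm the cache ahead of data we are about to read.
    float buffer[256] = {};
    prefetch_6x(buffer);       // first 6 cache lines (384 bytes)
    prefetch_2x(buffer + 96);  // 96 floats = 384 bytes in, 2 more lines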
diff --git a/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp b/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp
new file mode 100644
index 0000000000..00974436ff
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+// Abstract class for a GEMM function
+template<typename To, typename Tr>
+class GemmCommon {
+public:
+ virtual size_t get_working_size() const = 0;
+ virtual void execute(const To *, const int, const To *, const int, Tr *, const int, const Tr, const Tr, void *working_space = NULL) const = 0;
+ virtual ~GemmCommon() { }
+};
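A sketch of how a caller might drive any concrete GemmCommon implementation; the function name is hypothetical, and the extra 16 bytes of scratch are an assumption to leave room for the 16-byte alignment fix-up that implementations such as GemmInterleaved (next file) apply internally:

    #include <cstdint>
    #include <vector>

    // Hypothetical driver: compute C = alpha * A * B + beta * C with a scratch
    // buffer sized by the implementation itself.
    void run_gemm(const GemmCommon<float, float> &gemm,
                  const float *A, int lda, const float *B, int ldb,
                  float *C, int ldc, float alpha, float beta)
    {
        std::vector<int8_t> workspace(gemm.get_working_size() + 16);
        gemm.execute(A, lda, B, ldb, C, ldc, alpha, beta, workspace.data());
    }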
diff --git a/arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp b/arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp
new file mode 100644
index 0000000000..a186d88355
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include <stdio.h>
+
+#include "gemm_common.hpp"
+#include "profiler.hpp"
+#include "transform.hpp"
+#include "mergeresults.hpp"
+
+// Some macros used to decide how much working space to allocate.
+// Round allocations up to the next cache line.
+#define ALLOC_ROUND 64
+#define ROUND_UP(x) ((((x) + ALLOC_ROUND-1) / ALLOC_ROUND) * ALLOC_ROUND)
+
+// Implementation of the GemmCommon abstract class.
+//
+// This implementation interleaves the source matrices in blocks - good for
+// larger matrices.
+template<typename strategy, typename To, typename Tr>
+class GemmInterleaved : public GemmCommon<To, Tr> {
+ typedef typename strategy::operand_type Toi;
+ typedef typename strategy::result_type Tri;
+
+ const unsigned int M;
+ const unsigned int N;
+ const unsigned int K;
+
+ const bool trA;
+ const bool trB;
+
+ const strategy strat;
+
+ unsigned int k_block = 0;
+ unsigned int x_block = 0;
+ unsigned int Mround = 0;
+
+ size_t get_a_working_size() const {
+ return ROUND_UP(sizeof(Toi) * k_block * Mround);
+ }
+
+ size_t get_b_working_size() const {
+ return ROUND_UP(sizeof(Toi) * x_block * k_block);
+ }
+
+ size_t get_c_working_size() const {
+ return ROUND_UP(sizeof(Tri) * x_block * strat.out_height);
+ }
+
+public:
+ size_t get_working_size() const override {
+ return get_a_working_size() + get_b_working_size() + get_c_working_size();
+ }
+
+ GemmInterleaved(const CPUInfo *ci, const unsigned int M, const unsigned int N, const unsigned int K, const bool trA, const bool trB) : M(M), N(N), K(K), trA(trA), trB(trB), strat(ci) {
+ const unsigned int L1_size = ci->L1_size;
+ const unsigned int L2_size = ci->L2_size;
+
+ // Work out blocking parameters
+ // k_block: Each iteration will consume (out_width + out_height)
+ // operands - so how many iterations will fill the L1?
+ k_block = L1_size / (sizeof(Toi) * (strat.out_width + strat.out_height));
+
+ // Needs to be a multiple of the K unroll level.
+ k_block /= strat.k_unroll;
+ k_block *= strat.k_unroll;
+
+ // Now tune to the presented problem size; this is how many blocks we need.
+ int num_k_blocks = (K + (k_block - 1)) / k_block;
+
+ // So divide the space equally into that many blocks.
+ k_block = (K + num_k_blocks - 1) / num_k_blocks;
+
+ // And round UP to the K unroll level required.
+ k_block = (k_block + strat.k_unroll - 1) / strat.k_unroll;
+ k_block *= strat.k_unroll;
+
+ // x_block: Work out how many rows (of length k_block) will fit in the L2
+ x_block = L2_size / (sizeof(Toi) * k_block);
+
+ // Needs to be a multiple of the kernel output width.
+ x_block /= strat.out_width;
+ x_block *= strat.out_width;
+
+ // And tune to the presented problem size.
+ int num_x_blocks = (N + (x_block - 1)) / x_block;
+ x_block = (N + num_x_blocks - 1) / num_x_blocks;
+
+ x_block = (x_block + strat.out_width - 1) / strat.out_width;
+ x_block *= strat.out_width;
+
+ // Work out the rounded size of M - needed for some buffers.
+ Mround = (M + (strat.out_height - 1)) / strat.out_height;
+ Mround *= strat.out_height;
+ }
+
+ // Actually execute the GEMM.
+ void execute(const To *A, const int lda, const To *B, const int ldb, Tr *C, const int ldc, const Tr alpha, const Tr beta, void *working_space) const override {
+ profiler prof;
+
+ int8_t *working_space_bytes = reinterpret_cast<int8_t *>(working_space);
+ intptr_t working_space_int = reinterpret_cast<intptr_t>(working_space_bytes);
+ size_t diff = 0;
+
+ if (working_space_int & 0xF) {
+ diff = 0x10 - (working_space_int & 0xF);
+ }
+
+ // TODO: Multithreaded implementations could share the burden of transforming these blocks.
+ Toi * const a_panel = reinterpret_cast<Toi *>(working_space_bytes + diff);
+ Toi * const b_panel = reinterpret_cast<Toi *>(working_space_bytes + get_a_working_size() + diff);
+ Tri * const c_panel = reinterpret_cast<Tri *>(working_space_bytes + get_a_working_size() + get_b_working_size() + diff);
+
+ for (unsigned int k0=0; k0<K; k0 += k_block) {
+ unsigned int kmax = k0 + k_block;
+ if (kmax > K) kmax = K;
+
+ // Figure out how many "K" the kernel will actually process.
+ int kern_k = ((kmax - k0) + (strat.k_unroll - 1)) / strat.k_unroll;
+ kern_k *= strat.k_unroll;
+
+ prof(PROFILE_PREPA, [&](void) {
+ if (trA ^ strategy::A_transpose) {
+ Transform<strategy::A_interleave, strategy::A_block, true>(a_panel, A, lda, 0, M, k0, kmax);
+ } else {
+ Transform<strategy::A_interleave, strategy::A_block, false>(a_panel, A, lda, 0, M, k0, kmax);
+ }
+ });
+
+ for (unsigned int x0=0; x0<N; x0 += x_block) {
+ unsigned int xmax = x0 + x_block;
+ if (xmax > N) xmax = N;
+
+ int bblocks = (xmax - x0 + strat.out_width - 1) / strat.out_width;
+
+ prof(PROFILE_PREPB, [&](void) {
+ if (trB ^ strategy::B_transpose) {
+ Transform<strategy::B_interleave, strategy::B_block, true>(b_panel, B, ldb, x0, xmax, k0, kmax);
+ } else {
+ Transform<strategy::B_interleave, strategy::B_block, false>(b_panel, B, ldb, x0, xmax, k0, kmax);
+ }
+ });
+
+ for (unsigned int y=0; y<M; y+=strat.out_height) {
+ unsigned int ymax = y + strat.out_height;
+ if (ymax > M) ymax = M;
+
+ prof(PROFILE_KERNEL, [&](void) { strat.kernel(a_panel + (y * kern_k), b_panel, c_panel, 1, bblocks, kern_k); });
+ prof(PROFILE_MERGE, [&](void) { MergeResults<strategy::out_width, strategy::out_height>(C, c_panel, ldc, y, ymax, x0, xmax, alpha, (k0==0 ? beta : static_cast<Tr>(1))); });
+ }
+ }
+ }
+ }
+};
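To make the blocking arithmetic in the constructor concrete, here is one illustrative evaluation of its formulas, assuming the sgemm_12x8 strategy (out_width 12, out_height 8, k_unroll 1, 4-byte operands), a 32 KB L1, a 512 KB L2 and M = N = K = 1000; none of these numbers come from the patch itself:

    // k_block      = 32768 / (4 * (12 + 8))  = 409   operands that fit one L1 pass
    // num_k_blocks = ceil(1000 / 409)         = 3
    // k_block      = ceil(1000 / 3)           = 334   (already a multiple of k_unroll = 1)
    // x_block      = 524288 / (4 * 334)       = 392 -> 384 (rounded down to a multiple of 12)
    // num_x_blocks = ceil(1000 / 384)         = 3
    // x_block      = ceil(1000 / 3)           = 334 -> 336 (rounded up to a multiple of 12)
    // Mround       = ceil(1000 / 8) * 8       = 1000
    // get_working_size() = ROUND_UP(4*334*1000) + ROUND_UP(4*336*334) + ROUND_UP(4*336*8)
    //                    = 1336000 + 448896 + 10752 bytes (about 1.7 MB of scratch)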
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp
new file mode 100644
index 0000000000..e229e215ef
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+// Actual kernel implementations
+#include "a64_sgemm_12x8/generic.hpp"
+#include "a64_sgemm_12x8/a53.hpp"
+
+// 12x8 SGEMM "strategy" class.
+//
+// This describes the characteristics of a family of kernels, in terms of
+// the required interleave properties and the output block size.
+//
+// All kernels in the family must share these characteristics. The actual
+// kernel to be used can be chosen at runtime, based on the CPUInfo
+// structure.
+class sgemm_12x8 {
+public:
+ typedef float operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)(const float *, const float *, float *, int, int, int);
+
+ /* Describes the data layout for A input */
+ static const int A_interleave = 8;
+ static const int A_block = 1;
+ static const int A_transpose = 0;
+
+ /* Same for B input */
+ static const int B_interleave = 12;
+ static const int B_block = 1;
+ static const int B_transpose = 1;
+
+ /* Kernel blocking parameters */
+ static const int out_width = 12;
+ static const int out_height = 8;
+ static const int k_unroll = 1;
+
+ kern_type kernel{nullptr};
+
+ sgemm_12x8(const CPUInfo *ci) {
+ kernel = a64_sgemm_asimd_12x8;
+ if (ci->CPU == CPUTarget::A53) {
+ kernel = a64_sgemm_asimd_12x8_a53;
+ }
+ }
+};
+
+#endif // __aarch64__
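A sketch of how this strategy plugs into the GemmInterleaved wrapper from the previous file; the function, matrix arguments and cache sizes are illustrative assumptions, not code from the patch:

    #include <cstdint>
    #include <vector>

    // Hypothetical single-threaded SGEMM using the 12x8 strategy.
    void sgemm_example(const float *A, int lda, const float *B, int ldb, float *C, int ldc,
                       unsigned int M, unsigned int N, unsigned int K)
    {
        CPUInfo ci;
        ci.CPU     = CPUTarget::A53;  // selects a64_sgemm_asimd_12x8_a53 in the strategy ctor
        ci.L1_size = 32 * 1024;       // assumed cache sizes
        ci.L2_size = 512 * 1024;

        GemmInterleaved<sgemm_12x8, float, float> gemm(&ci, M, N, K, false, false);

        // +16 leaves room for the internal 16-byte alignment fix-up in execute().
        std::vector<int8_t> workspace(gemm.get_working_size() + 16);
        gemm.execute(A, lda, B, ldb, C, ldc, 1.0f, 0.0f, workspace.data());
    }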
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a53.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a53.hpp
new file mode 100644
index 0000000000..e58ce66825
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a53.hpp
@@ -0,0 +1,367 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+inline void a64_sgemm_asimd_12x8_a53(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
+ const float *a_ptr = Apanel;
+ float *c_ptr = Cpanel;
+
+ for (int yb=0; yb<ablocks; yb++) {
+ const float *a_ptr0 = a_ptr;
+ const float *b_ptr = Bpanel;
+
+ for (int xb=0; xb<bblocks; xb++) {
+ a_ptr = a_ptr0;
+ // Fix up for odd lengths - set a flag if K is odd, but make
+ // sure we round up the iteration count.
+ int oddk = (K & 1);
+ int k = ((K+1)/2) - 1;
+
+ register float32x4_t a0 asm("v0");
+ register float32x4_t a1 asm("v1");
+ register float32x4_t b0 asm("v2");
+ register float32x4_t b1 asm("v3");
+ register float32x4_t b2 asm("v4");
+ register float32x4_t a0a asm("v5");
+ register float32x4_t a1a asm("v6");
+
+ __asm __volatile (
+ // Initialize result registers, load initial operands, prime prefetches.
+ "movi v8.4s, #0x0\n"
+ "ldr %q[a0], [%[a_ptr]]\n"
+ "movi v9.4s, #0x0\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "movi v10.4s, #0x0\n"
+ "ldr %q[a1], [%[a_ptr], #16]\n"
+ "movi v11.4s, #0x0\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "movi v12.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #64]")
+ "movi v13.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #64]")
+ "movi v14.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #128]")
+ "movi v15.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #128]")
+ "movi v16.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #192]")
+ "movi v17.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #256]")
+ "movi v18.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #192]")
+ "movi v19.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #320]")
+ "movi v20.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #256]")
+ "movi v21.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #384]")
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+
+ // Skip loop if we are doing zero iterations of it.
+ "cbz %w[k], 4f\n"
+
+ "1:\n"
+ // Unroll 0
+ "ldr %d[b2], [%[b_ptr], #32]\n"
+ "nop\n"
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "subs %w[k], %w[k], #1\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+
+ "ldr %d[a0a], [%[a_ptr], #32]\n"
+ "ins %[b2].d[1], x20\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "ldr x20, [%[a_ptr], #40]\n"
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+
+ "ldr %d[a1a], [%[a_ptr], #48]\n"
+ "ins %[a0a].d[1], x20\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "ldr x20, [%[a_ptr], #56]\n"
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+
+ "ldr %d[b0], [%[b_ptr], #48]\n"
+ "ins %[a1a].d[1], x20\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "ldr x20, [%[b_ptr], #56]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+
+ ASM_PREFETCH("[%[a_ptr], #320]")
+ "ins %[b0].d[1], x20\n"
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+
+ ASM_PREFETCH("[%[b_ptr], #448]")
+ "nop\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+
+ "ldr %d[b1], [%[b_ptr], #64]\n"
+ "nop\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "ldr x20, [%[b_ptr], #72]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+
+ ASM_PREFETCH("[%[b_ptr], #512]")
+ "ins %[b1].d[1], x20\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+
+ // Unroll 1
+ "ldr %d[b2], [%[b_ptr], #80]\n"
+ "nop\n"
+ "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
+ "ldr x20, [%[b_ptr], #88]\n"
+ "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
+ "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
+
+ "ldr %d[a0], [%[a_ptr], #64]\n"
+ "ins %[b2].d[1], x20\n"
+ "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
+ "ldr x20, [%[a_ptr], #72]\n"
+ "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
+
+ "ldr %d[a1], [%[a_ptr], #80]\n"
+ "ins %[a0].d[1], x20\n"
+ "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
+ "ldr x20, [%[a_ptr], #88]\n"
+ "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
+ "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
+
+ "ldr %d[b0], [%[b_ptr], #96]\n"
+ "ins %[a1].d[1], x20\n"
+ "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
+ "ldr x20, [%[b_ptr], #104]\n"
+ "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
+
+ "nop\n"
+ "ins %[b0].d[1], x20\n"
+ "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
+
+ "nop\n"
+ "nop\n"
+ "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
+ "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
+
+ "ldr %d[b1], [%[b_ptr], #112]\n"
+ "nop\n"
+ "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
+ "ldr x20, [%[b_ptr], #120]\n"
+ "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+
+ "nop\n"
+ "ins %[b1].d[1], x20\n"
+ "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
+ "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
+ "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
+
+ "bne 1b\n"
+
+ // Branch here if K=1 or 2. Do the right thing for odd/even at the end.
+ "4:\n"
+ "cbnz %[oddk], 2f\n"
+
+ // Detached final iteration. (even K)
+ "ldr %d[b2], [%[b_ptr], #32]\n"
+ "nop\n"
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "subs %w[k], %w[k], #1\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+
+ "ldr %d[a0a], [%[a_ptr], #32]\n"
+ "ins %[b2].d[1], x20\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "ldr x20, [%[a_ptr], #40]\n"
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+
+ "ldr %d[a1a], [%[a_ptr], #48]\n"
+ "ins %[a0a].d[1], x20\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "ldr x20, [%[a_ptr], #56]\n"
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+
+ "ldr %d[b0], [%[b_ptr], #48]\n"
+ "ins %[a1a].d[1], x20\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "ldr x20, [%[b_ptr], #56]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+
+ "ins %[b0].d[1], x20\n"
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+
+ "nop\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+
+ "ldr %d[b1], [%[b_ptr], #64]\n"
+ "nop\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "ldr x20, [%[b_ptr], #72]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+
+ "ins %[b1].d[1], x20\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+
+ "ldr %d[b2], [%[b_ptr], #80]\n"
+ "nop\n"
+ "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
+ "ldr x20, [%[b_ptr], #88]\n"
+ "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
+ "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
+
+ "ins %[b2].d[1], x20\n"
+ "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
+ "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
+ "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
+ "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
+ "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
+ "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
+ "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
+ "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
+ "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
+ "b 3f\n"
+
+ // Detached final iteration. (odd K)
+ "2:\n"
+ "ldr %d[b2], [%[b_ptr], #32]\n"
+ "nop\n"
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+
+ "ins %[b2].d[1], x20\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "add %[b_ptr], %[b_ptr], #48\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+
+ // Common tail
+ "3:\n"
+ "str q8, [%[c_ptr]]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "str q24, [%[c_ptr], #32]\n"
+ "str q9, [%[c_ptr], #48]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "str q10, [%[c_ptr], #96]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "str q11, [%[c_ptr], #144]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "str q12, [%[c_ptr], #192]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "str q13, [%[c_ptr], #240]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "str q14, [%[c_ptr], #288]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "str q15, [%[c_ptr], #336]\n"
+ "str q23, [%[c_ptr], #352]\n"
+ "str q31, [%[c_ptr], #368]\n"
+ "add %[c_ptr], %[c_ptr], #384\n"
+ :
+ [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
+ [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a),
+ [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k)
+ : [oddk] "r" (oddk)
+ : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+ "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+ );
+ }
+ }
+}
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/generic.hpp
new file mode 100644
index 0000000000..082c200646
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/generic.hpp
@@ -0,0 +1,358 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include <arm_neon.h>
+
+// Kernel implementation.
+//
+// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order.
+// Assume that "Bpanel" points to a chunk of B blocks (each size 12xK) in read-order.
+// Assume that "Cpanel" points to a chunk of C output blocks (each size
+// 12x8), the chunks being arranged in a row major fashion.
+//
+// Note that the intent of this is that either ablocks or bblocks will be 1
+// - this construction allows the output loop to proceed in either order.
+
+inline void a64_sgemm_asimd_12x8_jumps(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K, long int row_jump=0, long int block_jump=0) {
+ const float *a_ptr = Apanel;
+ float *c_ptr = Cpanel;
+
+ for (int yb=0; yb<ablocks; yb++) {
+ const float *a_ptr0 = a_ptr;
+ const float *b_ptr = Bpanel;
+
+ for (int xb=0; xb<bblocks; xb++) {
+ a_ptr = a_ptr0;
+ // Fix up for odd lengths - set a flag if K is odd, but make
+ // sure we round up the iteration count.
+ int oddk = (K & 1);
+ int k = ((K+1)/2) - 1;
+
+ register float32x4_t a0 asm("v0");
+ register float32x4_t a1 asm("v1");
+ register float32x4_t b0 asm("v2");
+ register float32x4_t b1 asm("v3");
+ register float32x4_t b2 asm("v4");
+ register float32x4_t a0a asm("v5");
+ register float32x4_t a1a asm("v6");
+
+ __asm __volatile (
+ // Initialize result registers, load initial operands, prime prefetches.
+ "movi v8.4s, #0x0\n"
+ "ldr %q[a0], [%[a_ptr]]\n"
+ "movi v9.4s, #0x0\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "movi v10.4s, #0x0\n"
+ "ldr %q[a1], [%[a_ptr], #16]\n"
+ "movi v11.4s, #0x0\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "movi v12.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #64]")
+ "movi v13.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #64]")
+ "movi v14.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #128]")
+ "movi v15.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #128]")
+ "movi v16.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #192]")
+ "movi v17.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #256]")
+ "movi v18.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #192]")
+ "movi v19.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #320]")
+ "movi v20.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #256]")
+ "movi v21.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #384]")
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+
+ // Skip loop if we are doing zero iterations of it.
+ "cbz %w[k], 4f\n"
+
+ // Loop proper
+ "1:\n"
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+ "add %[b_ptr], %[b_ptr], %[row_jump]\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "ldr %q[a0a], [%[a_ptr], #32]\n"
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+ "ldr %q[a1a], [%[a_ptr], #48]\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "ldr %q[b0], [%[b_ptr], #48]\n"
+
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ ASM_PREFETCH("[%[a_ptr], #320]")
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+ "ldr %q[b1], [%[b_ptr], #64]\n"
+
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+ ASM_PREFETCH("[%[b_ptr], #448]")
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+ "ldr %q[b2], [%[b_ptr], #80]\n"
+
+ "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
+ "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
+ "ldr %q[a0], [%[a_ptr], #64]\n"
+ "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
+ "add %[b_ptr], %[b_ptr], %[row_jump]\n"
+ "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
+ "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
+ "ldr %q[a1], [%[a_ptr], #80]\n"
+ "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
+ "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
+ "ldr %q[b0], [%[b_ptr], #96]\n"
+
+ "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
+ ASM_PREFETCH("[%[b_ptr], #512]")
+ "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
+ "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
+ "ldr %q[b1], [%[b_ptr], #112]\n"
+
+ "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
+ "subs %w[k], %w[k], #1\n"
+ "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
+ "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
+ "bne 1b\n"
+
+ // Target to use when K is 1 or 2 (i.e. zero iterations of main loop)
+ "4:\n"
+
+ // Branch to alternative tail for odd K
+ "cbnz %[oddk], 2f\n"
+
+ // Detached final iteration (even K)
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+ "add %[b_ptr], %[b_ptr], %[row_jump]\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "ldr %q[a0a], [%[a_ptr], #32]\n"
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+ "ldr %q[a1a], [%[a_ptr], #48]\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "ldr %q[b0], [%[b_ptr], #48]\n"
+
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+ "ldr %q[b1], [%[b_ptr], #64]\n"
+
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+ "ldr %q[b2], [%[b_ptr], #80]\n"
+
+ "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
+ "add %[b_ptr], %[b_ptr], %[block_jump]\n"
+ "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
+ "add %[b_ptr], %[b_ptr], %[row_jump]\n"
+ "str q8, [%[c_ptr], #0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
+ "str q24, [%[c_ptr], #32]\n"
+
+ "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
+ "str q9, [%[c_ptr], #48]\n"
+ "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
+ "str q10, [%[c_ptr], #96]\n"
+
+ "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
+ "str q11, [%[c_ptr], #144]\n"
+
+ "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
+ "str q12, [%[c_ptr], #192]\n"
+
+ "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
+ "str q13, [%[c_ptr], #240]\n"
+
+ "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
+ "str q14, [%[c_ptr], #288]\n"
+
+ "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
+ "str q15, [%[c_ptr], #336]\n"
+
+ "b 3f\n"
+
+ // Detached final iteration (odd K)
+ "2:\n"
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "add %[b_ptr], %[b_ptr], %[row_jump]\n"
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "str q8, [%[c_ptr], #0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "add %[b_ptr], %[b_ptr], #48\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "str q24, [%[c_ptr], #32]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+ "str q9, [%[c_ptr], #48]\n"
+
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "str q10, [%[c_ptr], #96]\n"
+
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ "str q11, [%[c_ptr], #144]\n"
+
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "str q12, [%[c_ptr], #192]\n"
+
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "str q13, [%[c_ptr], #240]\n"
+
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "str q14, [%[c_ptr], #288]\n"
+
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+ "str q15, [%[c_ptr], #336]\n"
+
+ // Common tail
+ "3:\n"
+ "str q23, [%[c_ptr], #352]\n"
+ "str q31, [%[c_ptr], #368]\n"
+ "add %[c_ptr], %[c_ptr], #384\n"
+ :
+ [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
+ [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a),
+ [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k)
+ : [oddk] "r" (oddk), [row_jump] "r" (row_jump), [block_jump] "r" (block_jump)
+ : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+ "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+ );
+ }
+ }
+}
+
+inline void a64_sgemm_asimd_12x8(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
+ a64_sgemm_asimd_12x8_jumps(Apanel, Bpanel, Cpanel, ablocks, bblocks, K, 0, 0);
+}
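The layout comments at the top of this file imply the following bookkeeping for the panels the kernel walks; this is an illustrative summary, not code from the patch:

    // For 'ablocks' A blocks and 'bblocks' B blocks of depth K (in floats):
    //   Apanel: ablocks * 8  * K floats; each 8xK block is re-read for every B block
    //   Bpanel: bblocks * 12 * K floats; walked once per A block
    //   Cpanel: ablocks * bblocks * 96 floats, one row-major 12x8 tile per (yb, xb) pair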
diff --git a/arm_compute/core/NEON/kernels/assembly/mergeresults.hpp b/arm_compute/core/NEON/kernels/assembly/mergeresults.hpp
new file mode 100644
index 0000000000..6731480fca
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/mergeresults.hpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+template<unsigned int width, unsigned int height, typename Tin, typename Tout>
+void MergeResults(Tout *out, const Tin *in, int ldc, int y0, int ymax, int x0, int xmax, const Tout alpha, const Tout beta) {
+ int full_y_blocks = (ymax - y0) / height;
+ int y_remainder = (ymax - y0) % height;
+ int y_blocks = full_y_blocks + (y_remainder ? 1 : 0);
+
+ int full_x_blocks = (xmax - x0) / width;
+ int x_remainder = (xmax - x0) % width;
+ int x_blocks = full_x_blocks + (x_remainder ? 1 : 0);
+
+ for (int y_block = 0; y_block < y_blocks; y_block++) {
+ int ybase = y0 + (y_block * height);
+
+ int fill_rows = (y_block < full_y_blocks) ? height : y_remainder;
+
+ for (int x_block = 0; x_block < x_blocks; x_block++) {
+ int xbase = x0 + (x_block * width);
+
+ int fill_cols = (x_block < full_x_blocks) ? width : x_remainder;
+
+ for (int row=0; row < fill_rows; row++) {
+ for (int col=0; col < fill_cols; col++) {
+ Tout &p = out[(ybase + row) * ldc + xbase + col];
+
+ p = (p * alpha) + (beta * in[row * width + col]);
+ }
+ }
+
+ in += (width * height);
+ }
+ }
+}
+
+#include "merges/list.hpp"
diff --git a/arm_compute/core/NEON/kernels/assembly/merges/a64_merge_float_12x8.hpp b/arm_compute/core/NEON/kernels/assembly/merges/a64_merge_float_12x8.hpp
new file mode 100644
index 0000000000..f2c5fd86b9
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/merges/a64_merge_float_12x8.hpp
@@ -0,0 +1,236 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include "../asmlib.hpp"
+
+template<>
+inline void MergeResults<12, 8>(float *out, const float *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const float alpha, const float beta) {
+ const float *inptr = in;
+ prefetch_6x(inptr);
+ prefetch_6x(inptr + 96);
+
+ float32x4_t av = vdupq_n_f32(alpha);
+ float32x4_t bv = vdupq_n_f32(beta);
+
+ for (int y=y0; y<ymax; y+=8) {
+ float *outptr0 = out + (y * ldout) + x0;
+ float *outptr1 = outptr0 + ldout;
+ float *outptr2 = outptr1 + ldout;
+ float *outptr3 = outptr2 + ldout;
+ float *outptr4 = outptr3 + ldout;
+ float *outptr5 = outptr4 + ldout;
+ float *outptr6 = outptr5 + ldout;
+ float *outptr7 = outptr6 + ldout;
+
+ prefetch_2x(outptr0);
+ prefetch_2x(outptr1);
+ prefetch_2x(outptr2);
+ prefetch_2x(outptr3);
+ prefetch_2x(outptr4);
+ prefetch_2x(outptr5);
+ prefetch_2x(outptr6);
+ prefetch_2x(outptr7);
+
+ for (int i=x0; i<xmax; i+=12) {
+ float dummyres[12];
+
+ /* Make sure we throw away results if Y isn't a multiple of 8.
+ * We do this by pointing the result pointer at a dummy buffer
+ * we later discard. */
+ if ((y+7) >= ymax) {
+ switch ((y + 7) - ymax) {
+ case 6:
+ outptr1 = dummyres;
+ case 5:
+ outptr2 = dummyres;
+ case 4:
+ outptr3 = dummyres;
+ case 3:
+ outptr4 = dummyres;
+ case 2:
+ outptr5 = dummyres;
+ case 1:
+ outptr6 = dummyres;
+ case 0:
+ outptr7 = dummyres;
+ default:
+ break;
+ }
+ }
+
+ /* For ragged X, manually copy over the valid results. */
+ if ((i+11) >= xmax) {
+ for (int xi=0; xi<12; xi++) {
+ if ((i+xi) < xmax) {
+ *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
+ outptr0++;
+ *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta);
+ outptr1++;
+ *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta);
+ outptr2++;
+ *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta);
+ outptr3++;
+ *outptr4 = (alpha * inptr[xi + 48]) + (*outptr4 * beta);
+ outptr4++;
+ *outptr5 = (alpha * inptr[xi + 60]) + (*outptr5 * beta);
+ outptr5++;
+ *outptr6 = (alpha * inptr[xi + 72]) + (*outptr6 * beta);
+ outptr6++;
+ *outptr7 = (alpha * inptr[xi + 84]) + (*outptr7 * beta);
+ outptr7++;
+ }
+ }
+ inptr += 96;
+ } else {
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ // Rows 0-1
+ "LDP q16, q17, [%[outptr0]]\n"
+ "FMUL v16.4s, v16.4s, %[bv].4s\n"
+ "LDR q18, [%[outptr0], #32]\n"
+ "FMUL v17.4s, v17.4s, %[bv].4s\n"
+ "LDP q19, q20, [%[outptr1]]\n"
+ "FMUL v18.4s, v18.4s, %[bv].4s\n"
+ "LDR q21, [%[outptr1], #32]\n"
+ ASM_PREFETCH("[%[inptr], #768]")
+ "FMUL v19.4s, v19.4s, %[bv].4s\n"
+ "LDP q0, q1, [%[inptr]]\n"
+ "FMUL v20.4s, v20.4s, %[bv].4s\n"
+ "LDP q2, q3, [%[inptr], #32]\n"
+ "FMUL v21.4s, v21.4s, %[bv].4s\n"
+ "LDP q4, q5, [%[inptr], #64]\n"
+ "FMLA v16.4s, v0.4s, %[av].4s\n"
+ ASM_PREFETCH("[%[inptr], #832]")
+ "FMLA v17.4s, v1.4s, %[av].4s\n"
+ "STP q16, q17, [%[outptr0]], #32\n"
+ "FMLA v18.4s, v2.4s, %[av].4s\n"
+ "STR q18, [%[outptr0]], #16\n"
+ "FMLA v19.4s, v3.4s, %[av].4s\n"
+ ASM_PREFETCH("[%[inptr], #896]")
+ "FMLA v20.4s, v4.4s, %[av].4s\n"
+ "STP q19, q20, [%[outptr1]], #32\n"
+ "FMLA v21.4s, v5.4s, %[av].4s\n"
+ "STR q21, [%[outptr1]], #16\n"
+
+ // Rows 2-3
+ "LDP q16, q17, [%[outptr2]]\n"
+ "FMUL v16.4s, v16.4s, %[bv].4s\n"
+ "LDR q18, [%[outptr2], #32]\n"
+ "FMUL v17.4s, v17.4s, %[bv].4s\n"
+ "LDP q19, q20, [%[outptr3]]\n"
+ "FMUL v18.4s, v18.4s, %[bv].4s\n"
+ "LDR q21, [%[outptr3], #32]\n"
+ ASM_PREFETCH("[%[inptr], #960]")
+ "FMUL v19.4s, v19.4s, %[bv].4s\n"
+ "LDP q0, q1, [%[inptr], #96]\n"
+ "FMUL v20.4s, v20.4s, %[bv].4s\n"
+ "LDP q2, q3, [%[inptr], #128]\n"
+ "FMUL v21.4s, v21.4s, %[bv].4s\n"
+ "LDP q4, q5, [%[inptr], #160]\n"
+ "FMLA v16.4s, v0.4s, %[av].4s\n"
+ ASM_PREFETCH("[%[inptr], #1024]")
+ "FMLA v17.4s, v1.4s, %[av].4s\n"
+ "STP q16, q17, [%[outptr2]], #32\n"
+ "FMLA v18.4s, v2.4s, %[av].4s\n"
+ "STR q18, [%[outptr2]], #16\n"
+ "FMLA v19.4s, v3.4s, %[av].4s\n"
+ ASM_PREFETCH("[%[inptr], #1088]")
+ "FMLA v20.4s, v4.4s, %[av].4s\n"
+ "STP q19, q20, [%[outptr3]], #32\n"
+ "FMLA v21.4s, v5.4s, %[av].4s\n"
+ "STR q21, [%[outptr3]], #16\n"
+
+ // Rows 4-5
+ ASM_PREFETCH("[%[outptr0], #80]")
+ "LDP q16, q17, [%[outptr4]]\n"
+ "FMUL v16.4s, v16.4s, %[bv].4s\n"
+ "LDR q18, [%[outptr4], #32]\n"
+ "FMUL v17.4s, v17.4s, %[bv].4s\n"
+ "LDP q19, q20, [%[outptr5]]\n"
+ "FMUL v18.4s, v18.4s, %[bv].4s\n"
+ "LDR q21, [%[outptr5], #32]\n"
+ ASM_PREFETCH("[%[outptr1], #80]")
+ "FMUL v19.4s, v19.4s, %[bv].4s\n"
+ "LDP q0, q1, [%[inptr], #192]\n"
+ "FMUL v20.4s, v20.4s, %[bv].4s\n"
+ "LDP q2, q3, [%[inptr], #224]\n"
+ "FMUL v21.4s, v21.4s, %[bv].4s\n"
+ "LDP q4, q5, [%[inptr], #256]\n"
+ "FMLA v16.4s, v0.4s, %[av].4s\n"
+ ASM_PREFETCH("[%[outptr2], #80]")
+ "FMLA v17.4s, v1.4s, %[av].4s\n"
+ "STP q16, q17, [%[outptr4]], #32\n"
+ "FMLA v18.4s, v2.4s, %[av].4s\n"
+ "STR q18, [%[outptr4]], #16\n"
+ "FMLA v19.4s, v3.4s, %[av].4s\n"
+ ASM_PREFETCH("[%[outptr3], #80]")
+ "FMLA v20.4s, v4.4s, %[av].4s\n"
+ "STP q19, q20, [%[outptr5]], #32\n"
+ "FMLA v21.4s, v5.4s, %[av].4s\n"
+ "STR q21, [%[outptr5]], #16\n"
+
+ // Rows 6-7
+ ASM_PREFETCH("[%[outptr4], #80]")
+ "LDP q16, q17, [%[outptr6]]\n"
+ "FMUL v16.4s, v16.4s, %[bv].4s\n"
+ "LDR q18, [%[outptr6], #32]\n"
+ "FMUL v17.4s, v17.4s, %[bv].4s\n"
+ "LDP q19, q20, [%[outptr7]]\n"
+ "FMUL v18.4s, v18.4s, %[bv].4s\n"
+ "LDR q21, [%[outptr7], #32]\n"
+ ASM_PREFETCH("[%[outptr5], #80]")
+ "FMUL v19.4s, v19.4s, %[bv].4s\n"
+ "LDP q0, q1, [%[inptr], #288]\n"
+ "FMUL v20.4s, v20.4s, %[bv].4s\n"
+ "LDP q2, q3, [%[inptr], #320]\n"
+ "FMUL v21.4s, v21.4s, %[bv].4s\n"
+ "LDP q4, q5, [%[inptr], #352]\n"
+ "FMLA v16.4s, v0.4s, %[av].4s\n"
+ ASM_PREFETCH("[%[outptr6], #128]")
+ "FMLA v17.4s, v1.4s, %[av].4s\n"
+ "STP q16, q17, [%[outptr6]], #32\n"
+ "FMLA v18.4s, v2.4s, %[av].4s\n"
+ "STR q18, [%[outptr6]], #16\n"
+ "FMLA v19.4s, v3.4s, %[av].4s\n"
+ ASM_PREFETCH("[%[outptr7], #128]")
+ "FMLA v20.4s, v4.4s, %[av].4s\n"
+ "STP q19, q20, [%[outptr7]], #32\n"
+ "FMLA v21.4s, v5.4s, %[av].4s\n"
+ "STR q21, [%[outptr7]], #16\n"
+ "ADD %[inptr], %[inptr], #384\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3),
+ [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr)
+ : [av] "w" (av), [bv] "w" (bv)
+ : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q16", "q17", "q18", "q19", "q20", "q21"
+ );
+ }
+ }
+ }
+}
+
+#endif // __aarch64__
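Editor's note: for readers not fluent in A64 assembly, the block above performs the same alpha/beta update as the ragged-edge scalar path, just over a full 12x8 tile at once. A minimal scalar sketch of that tile update (reference only, not part of the patch; the helper name is hypothetical):

// Scalar reference of the 12x8 merge: out[r][c] = alpha * in[r*12 + c] + beta * out[r][c],
// where 'in' holds one 12x8 tile in the row-major order produced by the GEMM kernel.
static inline void merge_tile_12x8_reference(float *out, int ldout, const float *in,
                                             float alpha, float beta) {
    for (int r = 0; r < 8; r++) {
        for (int c = 0; c < 12; c++) {
            out[r * ldout + c] = alpha * in[r * 12 + c] + beta * out[r * ldout + c];
        }
    }
}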
diff --git a/arm_compute/core/NEON/kernels/assembly/merges/list.hpp b/arm_compute/core/NEON/kernels/assembly/merges/list.hpp
new file mode 100644
index 0000000000..4f23333ef1
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/merges/list.hpp
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "a64_merge_float_12x8.hpp"
diff --git a/arm_compute/core/NEON/kernels/assembly/profiler.hpp b/arm_compute/core/NEON/kernels/assembly/profiler.hpp
new file mode 100644
index 0000000000..d2f8ba923a
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/profiler.hpp
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef CYCLE_PROFILING
+
+#include "../perf.h"
+
+class profiler {
+private:
+ static const int maxevents = 10000;
+ unsigned long times[maxevents];
+ int events[maxevents];
+ int currentevent;
+ int countfd;
+
+public:
+ profiler() {
+ currentevent=0;
+ countfd=open_cycle_counter();
+ }
+
+ ~profiler() {
+ close(countfd);
+ int tots[5];
+ unsigned long counts[5];
+ const char * descs[] = { "Prepare A", "Prepare B", "Kernel", "Merge" };
+
+ for (int i=1; i<5; i++) {
+ tots[i] = 0;
+ counts[i] = 0;
+ }
+
+ printf("Profiled events:\n");
+ for (int i=0; i<currentevent; i++) {
+ printf("%10s: %ld\n", descs[events[i]-1], times[i]);
+ tots[events[i]]++;
+ counts[events[i]] += times[i];
+ }
+
+ printf("%20s %9s %9s %9s\n", "", "Events", "Total", "Average");
+ for (int i=1; i<5; i++) {
+ printf("%20s: %9d %9ld %9ld\n",descs[i-1],tots[i],counts[i],counts[i]/tots[i]);
+ }
+ }
+
+ template <typename T>
+ void operator() (int i, T func) {
+ if (currentevent==maxevents) {
+ func();
+ } else {
+ start_counter(countfd);
+ func();
+ long long cycs = stop_counter(countfd);
+ events[currentevent] = i;
+ times[currentevent++] = cycs;
+ }
+ }
+};
+
+#else
+
+class profiler {
+public:
+ template <typename T>
+ void operator() (int i, T func) {
+ func();
+ }
+};
+
+#endif
+
+#define PROFILE_PREPA 1
+#define PROFILE_PREPB 2
+#define PROFILE_KERNEL 3
+#define PROFILE_MERGE 4
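Editor's note: a short usage sketch (fragment, assumed usage) for the profiler class above. Each GEMM stage is passed as a callable tagged with one of the PROFILE_* IDs; the call operator only times it when CYCLE_PROFILING is defined and is a plain passthrough otherwise. The lambda bodies below are placeholders.

// Hypothetical fragment: wrap each GEMM stage in the profiler's call operator.
profiler prof;
prof(PROFILE_PREPA,  [&]() { /* interleave the A panel        */ });
prof(PROFILE_PREPB,  [&]() { /* transpose/transform the B panel */ });
prof(PROFILE_KERNEL, [&]() { /* run the 12x8 inner kernel      */ });
prof(PROFILE_MERGE,  [&]() { /* merge results back into C      */ });
// When CYCLE_PROFILING is defined, the destructor prints per-stage totals.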
diff --git a/arm_compute/core/NEON/kernels/assembly/transform.hpp b/arm_compute/core/NEON/kernels/assembly/transform.hpp
new file mode 100644
index 0000000000..717506f54c
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/transform.hpp
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+/*
+ * Generic transform.
+ *
+ * Assuming the untransposed case, this works by first reading <BlockBy>
+ * consecutive values from the first input row. The same number of values
+ * is then read from each of the next <IntBy-1> rows. The walk then returns
+ * to the first input row and repeats with the next <BlockBy> columns.
+ *
+ * The implementation needs to cope with the work requested in either
+ * dimension not actually being a multiple of the block sizes.
+ */
+template <unsigned IntBy, unsigned int BlockBy, bool Transposed, size_t TOutSize, size_t TInSize>
+struct TransformImpl {
+ template <typename TOut, typename TIn>
+ static void Transform(TOut* out, const TIn* const in, const int stride,
+ const int y0, const int ymax, const int x0, const int xmax) {
+ const int n_whole_y_blocks = (ymax - y0) / IntBy;
+ const int y_remainders = (ymax - y0) % IntBy;
+ const int n_y_blocks = n_whole_y_blocks + (y_remainders ? 1 : 0);
+
+ const int n_whole_x_blocks = (xmax - x0) / BlockBy;
+ const int x_remainders = (xmax - x0) % BlockBy;
+ const int n_x_blocks = n_whole_x_blocks + (x_remainders ? 1 : 0);
+
+ // "Y" loop: advance down the rows of the source IntBy rows at a time.
+ // Set up fill_rows to show the number rows to copy from, and blank_rows
+ // for the number of blank rows to add.
+ for (int y_block=0 ; y_block < n_y_blocks; y_block++) {
+ int fill_rows = (y_block < n_whole_y_blocks) ? IntBy : y_remainders;
+ int blank_rows = IntBy - fill_rows;
+
+ int y_base = y0 + (y_block * IntBy);
+
+ // So now advance along this block of rows, BlockBy columns at a time.
+ for (int x_block=0 ; x_block < n_x_blocks; x_block++) {
+ int fill_cols = (x_block < n_whole_x_blocks) ? BlockBy : x_remainders;
+ int blank_cols = BlockBy - fill_cols;
+
+ int x_base = x0 + (x_block * BlockBy);
+
+ for (int row = 0; row < fill_rows; row++) {
+ for (int col = 0; col < fill_cols; col++) {
+ // In-range copy. If it's transposed, we reverse the sense of rows and columns here.
+ if (Transposed) {
+ *out++ = static_cast<TOut>(in[(x_base + col) * stride + y_base + row]);
+ } else {
+ *out++ = static_cast<TOut>(in[(y_base + row) * stride + x_base + col]);
+ }
+ }
+ // "col" tail - row is in range but column is out of range.
+ for (int col=0; col < blank_cols; col++) {
+ *out++ = static_cast<TOut>(0);
+ }
+ }
+ // "row" tail - row is out of range so fill with zeros always.
+ for (int row = 0; row < blank_rows; row++) {
+ for (int col=0; col < (fill_cols + blank_cols); col++) {
+ *out++ = static_cast<TOut>(0);
+ }
+ }
+ }
+ }
+ }
+
+ template <typename T>
+ static inline void Transform(T* out, const T* const in, const int stride,
+ const int k0, const int kmax, const int x0, const int xmax) {
+ Transform<T, T>(out, in, stride, k0, kmax, x0, xmax);
+ }
+};
+
+/*****************************************************************************/
+template <unsigned int IntBy, unsigned int BlockBy, bool Transposed, typename TOut, typename TIn>
+void Transform(
+ TOut* out, const TIn* const in, const int stride,
+ const int k0, const int kmax, const int x0, const int xmax
+) {
+ // Redirect to a specialised implementation predicated on argument size.
+ TransformImpl<IntBy, BlockBy, Transposed, sizeof(TOut), sizeof(TIn)>::Transform(
+ out, in, stride, k0, kmax, x0, xmax
+ );
+}
+/*****************************************************************************/
+
+#include "transforms/list.hpp"
diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_32bit.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_32bit.hpp
new file mode 100644
index 0000000000..6317424598
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_32bit.hpp
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include "../asmlib.hpp"
+
+#include <arm_neon.h>
+
+template<>
+template<typename T>
+void TransformImpl<8, 1, false, 4, 4>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) {
+ uint32_t *outptr = (uint32_t *)out;
+ const uint32_t *inptr = (uint32_t *)in;
+
+    uint32_t zerobuff[8] = { 0 };
+
+ for (int y=y0; y<ymax; y+=8) {
+ const uint32_t *inptr0 = inptr + y * ldin + k0;
+ const uint32_t *inptr1 = inptr0 + ldin;
+ const uint32_t *inptr2 = inptr1 + ldin;
+ const uint32_t *inptr3 = inptr2 + ldin;
+ const uint32_t *inptr4 = inptr3 + ldin;
+ const uint32_t *inptr5 = inptr4 + ldin;
+ const uint32_t *inptr6 = inptr5 + ldin;
+ const uint32_t *inptr7 = inptr6 + ldin;
+
+ prefetch_2x(inptr0);
+ prefetch_2x(inptr1);
+ prefetch_2x(inptr2);
+ prefetch_2x(inptr3);
+ prefetch_2x(inptr4);
+ prefetch_2x(inptr5);
+ prefetch_2x(inptr6);
+ prefetch_2x(inptr7);
+
+ int x=(kmax-k0);
+ for (;x>7;x-=8) {
+ /* Cope with ragged cases by copying from a buffer of zeroes instead */
+ if ((y + 7) >= ymax) {
+ switch ((y + 7) - ymax) {
+ /* Everything falls through in here */
+ case 6:
+ inptr1 = zerobuff;
+ case 5:
+ inptr2 = zerobuff;
+ case 4:
+ inptr3 = zerobuff;
+ case 3:
+ inptr4 = zerobuff;
+ case 2:
+ inptr5 = zerobuff;
+ case 1:
+ inptr6 = zerobuff;
+ case 0:
+ inptr7 = zerobuff;
+ default:
+ break;
+ }
+ }
+
+ __asm __volatile (
+ // Load up 8 elements (2 vectors) from each of 8 sources.
+ "LDP q0, q1, [%[inptr0]], #32\n" // q0=A0A1A2A3
+ "LDP q2, q3, [%[inptr1]], #32\n" // q2=B0B1B2B3
+ "LDP q4, q5, [%[inptr2]], #32\n" // q4=C0C1C2C3
+ "ZIP1 v16.4s, v0.4s, v4.4s\n" // q16=A0C0A1C1
+ ASM_PREFETCH("[%[inptr0], #128]")
+ "LDP q6, q7, [%[inptr3]], #32\n" // q6=D0D1D2D3
+ "ZIP1 v17.4s, v2.4s, v6.4s\n" // q17=B0D0B1D1
+ "LDP q8, q9, [%[inptr4]], #32\n"
+ "LDP q10, q11, [%[inptr5]], #32\n"
+ ASM_PREFETCH("[%[inptr1], #128]")
+ "LDP q12, q13, [%[inptr6]], #32\n"
+ "ZIP1 v18.4s, v8.4s, v12.4s\n"
+ "LDP q14, q15, [%[inptr7]], #32\n"
+ "ZIP1 v19.4s, v10.4s, v14.4s\n"
+
+ ASM_PREFETCH("[%[inptr2], #128]")
+ "ZIP1 v20.4s, v16.4s, v17.4s\n" // q20=A0B0C0D0
+ "ZIP1 v21.4s, v18.4s, v19.4s\n"
+ "ZIP2 v22.4s, v16.4s, v17.4s\n"
+ "ZIP2 v23.4s, v18.4s, v19.4s\n"
+ ASM_PREFETCH("[%[inptr3], #128]")
+
+ "ZIP2 v16.4s, v0.4s, v4.4s\n"
+ "ZIP2 v17.4s, v2.4s, v6.4s\n"
+ "STP q20, q21, [%[outptr]], #32\n" // Write back the first element of each source
+
+ "ZIP2 v18.4s, v8.4s, v12.4s\n"
+ ASM_PREFETCH("[%[inptr4], #128]")
+ "ZIP2 v19.4s, v10.4s, v14.4s\n"
+ "STP q22, q23, [%[outptr]], #32\n" // Write back the second element of each source
+
+ "ZIP1 v20.4s, v16.4s, v17.4s\n"
+ "ZIP1 v21.4s, v18.4s, v19.4s\n"
+ ASM_PREFETCH("[%[inptr5], #128]")
+ "ZIP2 v22.4s, v16.4s, v17.4s\n"
+ "ZIP2 v23.4s, v18.4s, v19.4s\n"
+
+ "ZIP1 v16.4s, v1.4s, v5.4s\n"
+ "ZIP1 v17.4s, v3.4s, v7.4s\n"
+ ASM_PREFETCH("[%[inptr6], #128]")
+ "STP q20, q21, [%[outptr]], #32\n" // Third element
+
+ "ZIP1 v18.4s, v9.4s, v13.4s\n"
+ "ZIP1 v19.4s, v11.4s, v15.4s\n"
+ "STP q22, q23, [%[outptr]], #32\n" // Fourth element
+ ASM_PREFETCH("[%[inptr7], #128]")
+
+ "ZIP1 v20.4s, v16.4s, v17.4s\n"
+ "ZIP1 v21.4s, v18.4s, v19.4s\n"
+ "ZIP2 v22.4s, v16.4s, v17.4s\n"
+ "ZIP2 v23.4s, v18.4s, v19.4s\n"
+
+ "ZIP2 v16.4s, v1.4s, v5.4s\n"
+ "ZIP2 v17.4s, v3.4s, v7.4s\n"
+ "STP q20, q21, [%[outptr]], #32\n" // Fifth element
+
+ "ZIP2 v18.4s, v9.4s, v13.4s\n"
+ "ZIP2 v19.4s, v11.4s, v15.4s\n"
+ "STP q22, q23, [%[outptr]], #32\n" // Sixth element
+
+ "ZIP1 v20.4s, v16.4s, v17.4s\n"
+ "ZIP1 v21.4s, v18.4s, v19.4s\n"
+ "STP q20, q21, [%[outptr]], #32\n" // Seventh element
+
+ "ZIP2 v22.4s, v16.4s, v17.4s\n"
+ "ZIP2 v23.4s, v18.4s, v19.4s\n"
+ "STP q22, q23, [%[outptr]], #32\n" // Eighth element
+ : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3),
+ [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr)
+ :
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
+ "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
+ );
+ }
+
+ for (;x>0;x--) {
+ *outptr++ = *inptr0++;
+ *outptr++ = *inptr1++;
+ *outptr++ = *inptr2++;
+ *outptr++ = *inptr3++;
+ *outptr++ = *inptr4++;
+ *outptr++ = *inptr5++;
+ *outptr++ = *inptr6++;
+ *outptr++ = *inptr7++;
+ }
+ }
+}
+
+#endif // __aarch64__
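Editor's note: stripping away the NEON details, the transform above is an 8-way row interleave: element k of rows 0..7 is emitted consecutively before moving on to element k+1. A scalar sketch for a single band of 8 full rows (reference only; the helper name is hypothetical):

#include <cstdint>

// Output order for one 8-row band: in[0][k], in[1][k], ..., in[7][k], then k+1, ...
static inline void interleave_8way_reference(uint32_t *out, const uint32_t *in,
                                             int ldin, int k0, int kmax) {
    for (int k = k0; k < kmax; k++) {
        for (int row = 0; row < 8; row++) {
            *out++ = in[row * ldin + k];
        }
    }
}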
diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/list.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/list.hpp
new file mode 100644
index 0000000000..3cf6b41ffa
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/transforms/list.hpp
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+//#include "a32_interleave_6way_32bit.hpp"
+//#include "a32_transpose_interleave_8way_32bit.hpp"
+//#include "a64_interleave_8way_16bit.hpp"
+#include "a64_interleave_8way_32bit.hpp"
+//#include "a64_interleave_8way_half_to_float.hpp"
+//#include "a64_transpose_interleave_12way_16bit.hpp"
+//#include "a64_transpose_interleave_12way_half_to_float.hpp"
+//#include "a64_transpose_interleave_24way_16bit.hpp"
+#include "transpose_interleave_common.hpp"
diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/transpose_interleave_common.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/transpose_interleave_common.hpp
new file mode 100644
index 0000000000..882da9c831
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/transforms/transpose_interleave_common.hpp
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+template <unsigned int IntBy, typename TIn, typename TOut>
+struct TransposeInterleaveCommon {
+ // Override the moveblock_1xY methods to improve performance
+ static inline void moveblock_1x1(const TIn *&in0, TOut *out) {
+ for (unsigned int i = 0; i < IntBy; i++) {
+ *out++ = static_cast<TOut>(*in0++);
+ }
+ }
+
+ static inline void moveblock_1x2(const TIn *&in0, const TIn *&in1, TOut *out) {
+ for (unsigned int i = 0; i < IntBy; i++) {
+ *out++ = static_cast<TOut>(*in0++);
+ }
+ for (unsigned int i = 0; i < IntBy; i++) {
+ *out++ = static_cast<TOut>(*in1++);
+ }
+ }
+
+ static inline void moveblock_1x4(const TIn *&in0, const TIn *&in1, const TIn *&in2, const TIn *&in3, TOut *out) {
+ for (unsigned int i = 0; i < IntBy; i++) {
+ *out++ = static_cast<TOut>(*in0++);
+ }
+ for (unsigned int i = 0; i < IntBy; i++) {
+ *out++ = static_cast<TOut>(*in1++);
+ }
+ for (unsigned int i = 0; i < IntBy; i++) {
+ *out++ = static_cast<TOut>(*in2++);
+ }
+ for (unsigned int i = 0; i < IntBy; i++) {
+ *out++ = static_cast<TOut>(*in3++);
+ }
+ }
+
+ static inline void Transform(TOut *out, const TIn *in, const int stride, const int x0, const int xmax, const int k0, const int kmax) {
+ const auto ldin = stride;
+
+ TOut *outarray = out;
+ const TIn *inarray = in;
+ TOut *outptr_base = outarray;
+ const TIn *inptr_base = inarray + x0 + (k0 * ldin);
+ int ldout = (kmax - k0) * IntBy;
+
+ int k=(kmax-k0);
+ for ( ; k>3; k-=4) {
+ TOut *outptr = outptr_base;
+ const TIn *inptr = inptr_base;
+ const TIn *inptr1 = inptr + ldin;
+ const TIn *inptr2 = inptr1 + ldin;
+ const TIn *inptr3 = inptr2 + ldin;
+
+ prefetch_3x(inptr);
+ prefetch_3x(inptr1);
+ prefetch_3x(inptr2);
+ prefetch_3x(inptr3);
+
+ outptr_base += IntBy * 4;
+ inptr_base += ldin * 4;
+
+ for (int x = (xmax-x0) / IntBy; x > 0 ; x--) {
+ moveblock_1x4(inptr, inptr1, inptr2, inptr3, outptr);
+ outptr += ldout;
+ }
+ }
+
+ if (k) {
+ TOut *outptr = outptr_base;
+ const TIn *inptr = inptr_base;
+ const TIn *inptr1 = inptr + ldin;
+ const TIn *inptr2 = inptr1 + ldin;
+
+ prefetch_3x(inptr);
+ prefetch_3x(inptr1);
+ prefetch_3x(inptr2);
+
+ for (int x = (xmax-x0) / IntBy; x > 0 ; x--) {
+ switch(k) {
+ case 3:
+ moveblock_1x2(inptr, inptr1, outptr);
+ moveblock_1x1(inptr2, outptr + IntBy * 2);
+ break;
+
+ case 2:
+ moveblock_1x2(inptr, inptr1, outptr);
+ break;
+
+ case 1:
+ moveblock_1x1(inptr, outptr);
+ break;
+ default:
+ break;
+ }
+
+ outptr += ldout;
+ }
+ }
+
+ // Cope with ragged X cases
+ const unsigned int overflow = (xmax - x0) % IntBy;
+ if (overflow) {
+ const TIn *inptr_base = inarray + (xmax - overflow) + (k0 * ldin);
+ TOut *outptr = outarray + ((xmax - x0) / IntBy) * ldout;
+
+ for (int k=(kmax-k0); k>0; k--) {
+ const TIn *inptr = inptr_base;
+ inptr_base += ldin;
+
+ for (unsigned int x=0; x < IntBy; x++) {
+ TOut val = (x < overflow) ? static_cast<TOut>(*inptr++) : static_cast<TOut>(0);
+ *outptr++ = val;
+ }
+ }
+ }
+}
+};
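Editor's note: in plain terms, the Transform above is a transpose at the granularity of IntBy-wide column blocks: each block of IntBy source columns becomes one contiguous run in the output, ordered by source row. A scalar sketch covering the whole-block case (reference only; the ragged-X path above additionally zero-pads the last partial block):

// For column block b, source row k and element i within the block:
//   out[(b*(kmax-k0) + (k-k0)) * IntBy + i] = in[k*stride + x0 + b*IntBy + i]
template <unsigned int IntBy, typename TIn, typename TOut>
void transpose_interleave_reference(TOut *out, const TIn *in, int stride,
                                    int x0, int xmax, int k0, int kmax) {
    const int n_blocks = (xmax - x0) / IntBy; // whole blocks only, for clarity
    for (int b = 0; b < n_blocks; b++) {
        for (int k = k0; k < kmax; k++) {
            for (unsigned int i = 0; i < IntBy; i++) {
                out[(b * (kmax - k0) + (k - k0)) * IntBy + i] =
                    static_cast<TOut>(in[k * stride + x0 + b * IntBy + i]);
            }
        }
    }
}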
diff --git a/arm_compute/runtime/IScheduler.h b/arm_compute/runtime/IScheduler.h
index 6078abd06b..8918843c98 100644
--- a/arm_compute/runtime/IScheduler.h
+++ b/arm_compute/runtime/IScheduler.h
@@ -35,23 +35,23 @@ class IScheduler
{
public:
/** Default constructor. */
- IScheduler()
- : _target(CPUTarget::INTRINSICS)
- {
- }
+ IScheduler();
/** Destructor. */
virtual ~IScheduler() = default;
+
/** Sets the number of threads the scheduler will use to run the kernels.
*
* @param[in] num_threads If set to 0, then one thread per CPU core available on the system will be used, otherwise the number of threads specified.
*/
virtual void set_num_threads(unsigned int num_threads) = 0;
+
     /** Returns the number of threads that the SingleThreadScheduler has in its pool.
*
* @return Number of threads available in SingleThreadScheduler.
*/
virtual unsigned int num_threads() const = 0;
+
/** Runs the kernel in the same thread as the caller synchronously.
*
* @param[in] kernel Kernel to execute.
@@ -65,24 +65,14 @@ public:
*/
void set_target(CPUTarget target);
- /** Return the current CPU target.
+ /** Get CPU info.
*
- * @return Target CPU.
+ * @return CPU info.
*/
- CPUTarget target() const;
+ CPUInfo cpu_info() const;
protected:
- CPUTarget _target;
+ CPUInfo _info{};
};
-
-inline void IScheduler::set_target(CPUTarget target)
-{
- _target = target;
-}
-
-inline CPUTarget IScheduler::target() const
-{
- return _target;
-}
}
#endif /* __ARM_COMPUTE_ISCHEDULER_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
index 8e040b3055..893dfa0f9d 100644
--- a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
@@ -28,6 +28,7 @@
#include "arm_compute/core/NEON/kernels/NECol2ImKernel.h"
#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
@@ -37,6 +38,8 @@
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/Tensor.h"
+#include <memory>
+
namespace arm_compute
{
class ITensor;
@@ -59,6 +62,7 @@ public:
* Data types supported: Same as @p weights.
*/
void configure(const ITensor *weights, const ITensor *biases, ITensor *output, bool transpose1xW);
+
// Inherited methods overridden:
void run() override;
@@ -82,6 +86,7 @@ class NEConvolutionLayer : public IFunction
public:
/** Constructor */
NEConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+
/** Set the input and output tensors.
*
* @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
@@ -96,23 +101,26 @@ public:
* tensor has also been transposed with NEGEMMTranspose1xWKernel. Data type supported: Same as @p input.
*/
void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo());
+
// Inherited methods overridden:
void run() override;
private:
- MemoryGroup _memory_group;
- NEIm2ColKernel _input_im2col_kernel;
- NEGEMMInterleave4x4Kernel _input_interleave_kernel;
- NEConvolutionLayerReshapeWeights _reshape_weights;
- NEGEMMMatrixMultiplyKernel _mm_kernel;
- NECol2ImKernel _output_col2im_kernel;
- Tensor _input_im2col_reshaped;
- Tensor _input_interleaved_reshaped;
- Tensor _weights_reshaped;
- Tensor _gemm_output;
- bool _has_bias;
- bool _is_fully_connected_convolution;
- bool _are_weights_reshaped;
+ MemoryGroup _memory_group;
+ NEIm2ColKernel _input_im2col_kernel;
+ NEGEMMInterleave4x4Kernel _input_interleave_kernel;
+ NEConvolutionLayerReshapeWeights _reshape_weights;
+ NEGEMMMatrixMultiplyKernel _mm_kernel;
+ std::unique_ptr<NEGEMMAssemblyBaseKernel> _mm_optimised_kernel;
+ NECol2ImKernel _output_col2im_kernel;
+ Tensor _input_im2col_reshaped;
+ Tensor _input_interleaved_reshaped;
+ Tensor _weights_reshaped;
+ Tensor _gemm_output;
+ Tensor _workspace;
+ bool _has_bias;
+ bool _is_fully_connected_convolution;
+ bool _are_weights_reshaped;
};
}
#endif /* __ARM_COMPUTE_NECONVOLUTIONLAYER_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMM.h b/arm_compute/runtime/NEON/functions/NEGEMM.h
index b4b9e8be01..068e7c5ce8 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMM.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMM.h
@@ -25,6 +25,7 @@
#define __ARM_COMPUTE_NEGEMM_H__
#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
@@ -51,6 +52,7 @@ class NEGEMM : public IFunction
public:
/** Constructor */
NEGEMM(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+
/** Initialise the kernel's inputs, output
*
* @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C].
@@ -69,15 +71,17 @@ public:
void run() override;
private:
- MemoryGroup _memory_group;
- NEGEMMInterleave4x4Kernel _interleave_kernel;
- NEGEMMTranspose1xWKernel _transpose_kernel;
- NEGEMMMatrixMultiplyKernel _mm_kernel;
- NEGEMMMatrixAdditionKernel _ma_kernel;
- Tensor _tmp_a;
- Tensor _tmp_b;
- bool _run_vector_matrix_multiplication;
- bool _run_addition;
+ MemoryGroup _memory_group;
+ NEGEMMInterleave4x4Kernel _interleave_kernel;
+ NEGEMMTranspose1xWKernel _transpose_kernel;
+ NEGEMMMatrixMultiplyKernel _mm_kernel;
+ std::unique_ptr<NEGEMMAssemblyBaseKernel> _mm_optimised_kernel;
+ NEGEMMMatrixAdditionKernel _ma_kernel;
+ Tensor _tmp_a;
+ Tensor _tmp_b;
+ Tensor _workspace;
+ bool _run_vector_matrix_multiplication;
+ bool _run_addition;
};
}
#endif /*__ARM_COMPUTE_NEGEMM_H__ */
diff --git a/scripts/add_copyright.py b/scripts/add_copyright.py
index 0c5b8f00cf..a9d4929db8 100755
--- a/scripts/add_copyright.py
+++ b/scripts/add_copyright.py
@@ -71,7 +71,7 @@ for top in ['./arm_compute', './tests','./src','./examples','./utils/','./framew
content = fd.read()
_, extension = os.path.splitext(f)
- if extension in ['.cpp', '.h', '.inl', '.cl']:
+ if extension in ['.cpp', '.h', '.hpp', '.inl', '.cl']:
if not content.startswith('/*'):
add_cpp_copyright(path, content)
elif extension == '.py' or f in ['SConstruct', 'SConscript']:
diff --git a/scripts/check_bad_style.sh b/scripts/check_bad_style.sh
index 386824015f..ab2b1a016d 100755
--- a/scripts/check_bad_style.sh
+++ b/scripts/check_bad_style.sh
@@ -5,7 +5,7 @@ set -e
DIRECTORIES="./arm_compute ./src ./examples ./tests ./utils ./support"
-grep -HrnP "/\*\*$" $DIRECTORIES | tee bad_style.log
+grep -HrnP --exclude-dir=assembly "/\*\*$" $DIRECTORIES | tee bad_style.log
if (( `cat bad_style.log | wc -l` > 0 ))
then
echo ""
@@ -13,7 +13,7 @@ then
exit -1
fi
-grep -Hnr --exclude=Doxyfile "@brief" $DIRECTORIES | tee bad_style.log
+grep -Hnr --exclude-dir=assembly --exclude=Doxyfile "@brief" $DIRECTORIES | tee bad_style.log
if (( `cat bad_style.log | wc -l` > 0 ))
then
echo ""
@@ -21,7 +21,7 @@ then
exit -1
fi
-grep -HnRE "\buint " --exclude-dir=cl_kernels $DIRECTORIES | tee bad_style.log
+grep -HnRE --exclude-dir=assembly "\buint " --exclude-dir=cl_kernels $DIRECTORIES | tee bad_style.log
if [[ $(cat bad_style.log | wc -l) > 0 ]]
then
echo ""
@@ -29,7 +29,7 @@ then
exit -1
fi
-grep -HnR "float32_t" $DIRECTORIES | tee bad_style.log
+grep -HnR --exclude-dir=assembly "float32_t" $DIRECTORIES | tee bad_style.log
if [[ $(cat bad_style.log | wc -l) > 0 ]]
then
echo ""
@@ -37,7 +37,7 @@ then
exit -1
fi
-grep -Hnir "arm[_ ]\?cv" $DIRECTORIES | tee bad_style.log
+grep -Hnir --exclude-dir=assembly "arm[_ ]\?cv" $DIRECTORIES | tee bad_style.log
if [[ $(cat bad_style.log | wc -l) > 0 ]]
then
echo ""
@@ -45,7 +45,7 @@ then
exit -1
fi
-grep -Hnir "#.*defined[^(]" $DIRECTORIES | tee bad_style.log
+grep -Hnir --exclude-dir=assembly "#.*defined[^(]" $DIRECTORIES | tee bad_style.log
if [[ $(cat bad_style.log | wc -l) > 0 ]]
then
echo ""
@@ -53,7 +53,7 @@ then
exit -1
fi
-grep -Hnir "#else$\|#endif$" $DIRECTORIES | tee bad_style.log
+grep -Hnir --exclude-dir=assembly "#else$\|#endif$" $DIRECTORIES | tee bad_style.log
if [[ $(cat bad_style.log | wc -l) > 0 ]]
then
echo ""
@@ -61,7 +61,7 @@ then
exit -1
fi
-grep -Hnir "ARM_COMPUTE_ENABLE_FP16" ./tests/validation/CL | tee bad_style.log
+grep -Hnir --exclude-dir=assembly "ARM_COMPUTE_ENABLE_FP16" ./tests/validation/CL | tee bad_style.log
if [[ $(cat bad_style.log | wc -l) > 0 ]]
then
echo ""
diff --git a/scripts/clang_tidy_rules.py b/scripts/clang_tidy_rules.py
index c3faf8736c..900413c90b 100755
--- a/scripts/clang_tidy_rules.py
+++ b/scripts/clang_tidy_rules.py
@@ -42,12 +42,16 @@ def filter_clang_tidy_lines( lines ):
for i in range(0, len(lines)):
line = lines[i]
+ if "/assembly/" in line:
+ continue
+
if "error:" in line:
if (("Utils.cpp" in line and "'arm_compute_version.embed' file not found" in line) or
("cl2.hpp" in line and "cast from pointer to smaller type 'cl_context_properties' (aka 'int') loses information" in line) or
("arm_fp16.h" in line) or
("omp.h" in line) or
- ("memory" in line and "cast from pointer to smaller type 'uintptr_t' (aka 'unsigned int') loses information" in line) or
+ ("cast from pointer to smaller type 'uintptr_t' (aka 'unsigned int') loses information" in line) or
+ ("cast from pointer to smaller type 'std::uintptr_t' (aka 'unsigned int') loses information" in line) or
("NEMath.inl" in line and "statement expression not allowed at file scope" in line) or
("Utils.h" in line and "no member named 'unmap' in 'arm_compute::Tensor'" in line) or
("Utils.h" in line and "no member named 'map' in 'arm_compute::Tensor'" in line) or
diff --git a/scripts/fix_code_formatting.sh b/scripts/fix_code_formatting.sh
index ccda38abab..a07d2615af 100755
--- a/scripts/fix_code_formatting.sh
+++ b/scripts/fix_code_formatting.sh
@@ -28,6 +28,11 @@ else
fi
for f in $files
do
+ if [[ $f == *"/assembly/"* ]]
+ then
+ continue
+ fi
+
sed -i 's/\t/ /g' $f
clang-format -i -style=file $f
astyle -n -q $ASTYLE_PARAMETERS $f
diff --git a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
index c76c39aa4b..ae5d456141 100644
--- a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
@@ -85,10 +85,10 @@ void gemm_interleave_16bit_elements(const ITensor *input, ITensor *output, const
const uint16x4x4_t data =
{
{
- vld1_u16(reinterpret_cast<uint16_t *>(in.ptr() + 0 * in_stride)),
- vld1_u16(reinterpret_cast<uint16_t *>(in.ptr() + 1 * in_stride)),
- vld1_u16(reinterpret_cast<uint16_t *>(in.ptr() + 2 * in_stride)),
- vld1_u16(reinterpret_cast<uint16_t *>(in.ptr() + 3 * in_stride)),
+ vld1_u16(reinterpret_cast<const uint16_t *>(in.ptr() + 0 * in_stride)),
+ vld1_u16(reinterpret_cast<const uint16_t *>(in.ptr() + 1 * in_stride)),
+ vld1_u16(reinterpret_cast<const uint16_t *>(in.ptr() + 2 * in_stride)),
+ vld1_u16(reinterpret_cast<const uint16_t *>(in.ptr() + 3 * in_stride)),
}
};
vst4_u16(reinterpret_cast<uint16_t *>(out.ptr()), data);
@@ -113,10 +113,10 @@ void gemm_interleave_32bit_elements(const ITensor *input, ITensor *output, const
const uint32x4x4_t data =
{
{
- vld1q_u32(reinterpret_cast<uint32_t *>(in.ptr() + 0 * in_stride)),
- vld1q_u32(reinterpret_cast<uint32_t *>(in.ptr() + 1 * in_stride)),
- vld1q_u32(reinterpret_cast<uint32_t *>(in.ptr() + 2 * in_stride)),
- vld1q_u32(reinterpret_cast<uint32_t *>(in.ptr() + 3 * in_stride))
+ vld1q_u32(reinterpret_cast<const uint32_t *>(in.ptr() + 0 * in_stride)),
+ vld1q_u32(reinterpret_cast<const uint32_t *>(in.ptr() + 1 * in_stride)),
+ vld1q_u32(reinterpret_cast<const uint32_t *>(in.ptr() + 2 * in_stride)),
+ vld1q_u32(reinterpret_cast<const uint32_t *>(in.ptr() + 3 * in_stride))
}
};
vst4q_u32(reinterpret_cast<uint32_t *>(out.ptr()), data);
diff --git a/src/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.cpp b/src/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.cpp
new file mode 100644
index 0000000000..d70524b6b8
--- /dev/null
+++ b/src/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.cpp
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/AccessWindowTranspose.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp"
+} // namespace arm_compute
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+
+namespace arm_compute
+{
+void NEGEMMAArch64Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
+
+ _input0 = input0;
+ _input1 = input1;
+ _output = output;
+ _workspace = workspace;
+ _alpha = alpha;
+ _beta = beta;
+ _transform_0 = transform_0;
+ _transform_1 = transform_1;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info());
+
+ AccessWindowRectangle output_access(output->info(), 0, 0, 12, 8);
+
+ const int input0_access_end = ceil_to_multiple(input0->info()->tensor_shape().x(), 8);
+ const int input1_access_end = ceil_to_multiple(input1->info()->tensor_shape().x(), 12);
+
+ update_window_and_padding(win,
+ AccessWindowStatic(input0->info(), 0, 0, input0_access_end, input0->info()->tensor_shape().y()),
+ AccessWindowStatic(input1->info(), 0, 0, input1_access_end, input1->info()->tensor_shape().y()),
+ output_access);
+
+ INEKernel::configure(win);
+}
+
+void NEGEMMAArch64Kernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ const int lda = _input0->info()->strides_in_bytes().y() / sizeof(float);
+ const int ldb = _input1->info()->strides_in_bytes().y() / sizeof(float);
+ const int ldc = _output->info()->strides_in_bytes().y() / sizeof(float);
+
+ const auto in1_ptr = reinterpret_cast<const float *>(_input1->buffer());
+
+ const int M = std::min(_output->info()->tensor_shape().y(), static_cast<size_t>(window.y().end())) - window.y().start();
+ const int N = _output->info()->tensor_shape().x();
+ const int K = _input0->info()->tensor_shape().x();
+
+ // Only iterate over batches
+ Window win(window);
+ win.set(0, Window::Dimension(0, 1, 1));
+ win.set(1, Window::Dimension(0, 1, 1));
+
+ Iterator in0(_input0, window);
+ Iterator out(_output, window);
+
+ GemmInterleaved<sgemm_12x8, float, float> gemm(&info.cpu_info, M, N, K, !_transform_0, !_transform_1);
+ constexpr size_t alignment = 4096;
+ const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id;
+ void *workspace = _workspace->buffer() + offset;
+ size_t workspace_size = _workspace->info()->total_size();
+
+ if(support::cpp11::align(alignment, gemm.get_working_size(), workspace, workspace_size) == nullptr)
+ {
+ ARM_COMPUTE_ERROR("Not enough space to align buffer!");
+ }
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ gemm.execute(reinterpret_cast<const float *>(in0.ptr()), lda,
+ reinterpret_cast<const float *>(in1_ptr), ldb,
+ reinterpret_cast<float *>(out.ptr()), ldc,
+ _alpha, _beta, workspace);
+ },
+ in0, out);
+}
+} // namespace arm_compute
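Editor's note: the workspace handling in run() gives each thread its own 4096-byte-aligned slice of the shared workspace tensor; slice offsets are spaced (working_size + alignment - 1) bytes apart so there is always room to round up to the boundary. The kernel uses the library's support::cpp11::align wrapper; the sketch below uses std::align purely for illustration and its names are hypothetical.

#include <cstddef>
#include <memory>

// Returns an aligned pointer into 'base' for 'thread_id', or nullptr if the
// buffer (sized (working_size + 4095) * num_threads) cannot accommodate it.
void *thread_workspace(void *base, size_t total_size, size_t working_size, unsigned int thread_id) {
    constexpr size_t alignment = 4096;
    const size_t offset = (working_size + alignment - 1) * thread_id;
    if (offset >= total_size) {
        return nullptr;
    }
    void  *ptr   = static_cast<char *>(base) + offset;
    size_t space = total_size - offset;
    // std::align bumps 'ptr' up to the next 4096-byte boundary if it fits.
    return std::align(alignment, working_size, ptr, space);
}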
diff --git a/src/runtime/CPP/CPPScheduler.cpp b/src/runtime/CPP/CPPScheduler.cpp
index 77aa044144..a83a0bc0d3 100644
--- a/src/runtime/CPP/CPPScheduler.cpp
+++ b/src/runtime/CPP/CPPScheduler.cpp
@@ -178,7 +178,7 @@ void CPPScheduler::schedule(ICPPKernel *kernel, unsigned int split_dimension)
/** [Scheduler example] */
ThreadInfo info;
- info.cpu = _target;
+ info.cpu_info = _info;
const Window &max_window = kernel->window();
const unsigned int num_iterations = max_window.num_iterations(split_dimension);
diff --git a/src/runtime/CPP/SingleThreadScheduler.cpp b/src/runtime/CPP/SingleThreadScheduler.cpp
index 4e46a59fd0..c8285b43a7 100644
--- a/src/runtime/CPP/SingleThreadScheduler.cpp
+++ b/src/runtime/CPP/SingleThreadScheduler.cpp
@@ -27,8 +27,8 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Utils.h"
-using namespace arm_compute;
-
+namespace arm_compute
+{
SingleThreadScheduler &SingleThreadScheduler::get()
{
static SingleThreadScheduler scheduler;
@@ -45,7 +45,7 @@ void SingleThreadScheduler::schedule(ICPPKernel *kernel, unsigned int split_dime
{
ARM_COMPUTE_UNUSED(split_dimension);
ThreadInfo info;
- info.cpu = _target;
+ info.cpu_info = cpu_info();
kernel->run(kernel->window(), info);
}
@@ -53,3 +53,4 @@ unsigned int SingleThreadScheduler::num_threads() const
{
return 1;
}
+} // namespace arm_compute
diff --git a/src/runtime/IScheduler.cpp b/src/runtime/IScheduler.cpp
new file mode 100644
index 0000000000..1745764bbb
--- /dev/null
+++ b/src/runtime/IScheduler.cpp
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/IScheduler.h"
+
+#include <array>
+#include <cstdlib>
+#include <cstring>
+#include <fcntl.h>
+#include <sched.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+namespace
+{
+unsigned int get_cpu_impl()
+{
+#ifndef BARE_METAL
+ int fd = open("/proc/cpuinfo", 0); // NOLINT
+ std::array<char, 1200> buff{ {} };
+ char *pos = nullptr;
+ char *end = nullptr;
+ bool foundid = false;
+
+ int cpu = sched_getcpu();
+
+ if(fd == -1)
+ {
+ return 0;
+ }
+
+ int charsread = read(fd, buff.data(), 1200);
+ pos = buff.data();
+ end = buff.data() + charsread;
+
+ close(fd);
+
+ /* So, to date I've encountered two formats for /proc/cpuinfo.
+ *
+ * One of them just lists processor : n for each processor (with no
+ * other info), then at the end lists part information for the current
+ * CPU.
+ *
+ * The other has an entire clause (including part number info) for each
+ * CPU in the system, with "processor : n" headers.
+ *
+ * We can cope with either of these formats by waiting to see
+ * "processor: n" (where n = our CPU ID), and then looking for the next
+ * "CPU part" field.
+ */
+ while(pos < end)
+ {
+ if(foundid && strncmp(pos, "CPU part", 8) == 0)
+ {
+ /* Found part number */
+ pos += 11;
+
+ for(char *ch = pos; ch < end; ch++)
+ {
+ if(*ch == '\n')
+ {
+ *ch = '\0';
+ break;
+ }
+ }
+
+ return strtoul(pos, nullptr, 0);
+ }
+
+ if(strncmp(pos, "processor", 9) == 0)
+ {
+ /* Found processor ID, see if it's ours. */
+ pos += 11;
+
+ for(char *ch = pos; ch < end; ch++)
+ {
+ if(*ch == '\n')
+ {
+ *ch = '\0';
+ break;
+ }
+ }
+
+ int num = strtol(pos, nullptr, 0);
+
+ if(num == cpu)
+ {
+ foundid = true;
+ }
+ }
+
+ while(pos < end)
+ {
+ char ch = *pos++;
+ if(ch == '\n' || ch == '\0')
+ {
+ break;
+ }
+ }
+ }
+#endif /* BARE_METAL */
+
+ return 0;
+}
+} // namespace
+
+namespace arm_compute
+{
+IScheduler::IScheduler()
+{
+ switch(get_cpu_impl())
+ {
+ case 0xd03:
+ _info.CPU = CPUTarget::A53;
+ break;
+ default:
+#ifdef __aarch64__
+ _info.CPU = CPUTarget::ARMV8;
+#else /* __aarch64__ */
+ _info.CPU = CPUTarget::INTRINSICS;
+#endif /* __aarch64__ */
+ break;
+ }
+
+ _info.L1_size = 31000;
+ _info.L2_size = 500000;
+}
+
+void IScheduler::set_target(CPUTarget target)
+{
+ _info.CPU = target;
+}
+
+CPUInfo IScheduler::cpu_info() const
+{
+ return _info;
+}
+} // namespace arm_compute
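Editor's note: an illustrative /proc/cpuinfo excerpt of the per-CPU-clause format described in the comment above (all values hypothetical). The parser waits for the "processor : n" entry matching the current CPU, then reads the next "CPU part" value; 0xd03 is the Cortex-A53 part number that the constructor maps to CPUTarget::A53.

processor       : 1
BogoMIPS        : 38.40
Features        : fp asimd evtstrm aes pmull sha1 sha2 crc32
CPU implementer : 0x41
CPU architecture: 8
CPU variant     : 0x0
CPU part        : 0xd03
CPU revision    : 4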
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index 0466a4a501..44bf2de70c 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -23,17 +23,25 @@
*/
#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
+#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp"
+} // namespace arm_compute
#include <cmath>
#include <tuple>
-using namespace arm_compute;
-
+namespace arm_compute
+{
NEConvolutionLayerReshapeWeights::NEConvolutionLayerReshapeWeights(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _weights_reshape_kernel(), _weights_transposed_kernel(), _weights_reshaped(), _transpose1xW(false)
{
@@ -69,8 +77,10 @@ void NEConvolutionLayerReshapeWeights::configure(const ITensor *weights, const I
_weights_reshaped.allocator()->init(info_wr);
_memory_group.manage(&_weights_reshaped);
+
_weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
_weights_transposed_kernel.configure(&_weights_reshaped, output);
+
_weights_reshaped.allocator()->allocate();
}
else
@@ -84,6 +94,7 @@ void NEConvolutionLayerReshapeWeights::run()
_memory_group.acquire();
NEScheduler::get().schedule(&_weights_reshape_kernel, 3);
+
if(_transpose1xW)
{
NEScheduler::get().schedule(&_weights_transposed_kernel, Window::DimY);
@@ -93,8 +104,8 @@ void NEConvolutionLayerReshapeWeights::run()
}
NEConvolutionLayer::NEConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _input_interleave_kernel(), _reshape_weights(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(),
- _input_interleaved_reshaped(), _weights_reshaped(), _gemm_output(), _has_bias(false), _is_fully_connected_convolution(false), _are_weights_reshaped(false)
+ : _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _input_interleave_kernel(), _reshape_weights(), _mm_kernel(), _mm_optimised_kernel(nullptr), _output_col2im_kernel(),
+ _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(), _gemm_output(), _workspace(), _has_bias(false), _is_fully_connected_convolution(false), _are_weights_reshaped(false)
{
}
@@ -137,45 +148,72 @@ void NEConvolutionLayer::configure(const ITensor *input, const ITensor *weights,
std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width, kernel_height,
conv_info);
- // Check if its a "fully connected" convolution
+    // Check if it's a "fully connected" convolution, i.e. the output size is 1x1xnum_kernels
_is_fully_connected_convolution = ((conv_w == 1) && (conv_h == 1));
+#if defined(__aarch64__)
+ if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && dt == DataType::F32)
+ {
+ _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch64Kernel>();
+ }
+#endif /* defined(__aarch64__) */
+
unsigned int mat_weights_cols = weights->info()->dimension(3);
unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + (_has_bias ? 1 : 0);
// Reshape weights if needed
- if(_are_weights_reshaped)
+ if(_mm_optimised_kernel != nullptr)
{
- mat_weights_cols = weights_info.num_kernels();
- const unsigned int quarter_reshaped_cols = weights->info()->dimension(0) / 4;
- mat_weights_rows = (_has_bias ? 1 + quarter_reshaped_cols : quarter_reshaped_cols);
+ if(_are_weights_reshaped)
+ {
+ mat_weights_cols = weights_info.num_kernels();
+ mat_weights_rows = weights->info()->dimension(1);
+ }
+ else
+ {
+ TensorShape reshaped_weights_shape{ mat_weights_cols, mat_weights_rows };
+
+ // Create tensor to store the reshaped weights
+ _weights_reshaped.allocator()->init(TensorInfo(reshaped_weights_shape, 1, dt, fixed_point_position));
+ _reshape_weights.configure(weights, biases, &_weights_reshaped, false /* 1xW transpose */);
+ weights = &_weights_reshaped;
+ }
}
else
{
- if(_is_fully_connected_convolution)
+ if(_are_weights_reshaped)
{
- // Create tensor to store the reshaped weights
- TensorShape shape_wr(mat_weights_cols, mat_weights_rows);
- TensorInfo info_wr(shape_wr, 1, dt, fixed_point_position);
- _weights_reshaped.allocator()->init(info_wr);
- _reshape_weights.configure(weights, biases, &_weights_reshaped, false /* 1xW transpose */);
+ mat_weights_cols = weights_info.num_kernels();
+ mat_weights_rows = weights->info()->dimension(0) / 4 + (_has_bias ? 1 : 0);
}
else
{
- // Create tensor to store transposed weights
- const float transpose_width = 16.0f / input->info()->element_size();
- TensorShape shape_wt(mat_weights_rows * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(mat_weights_cols / transpose_width)));
- TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position);
- _weights_reshaped.allocator()->init(info_wt);
- _reshape_weights.configure(weights, biases, &_weights_reshaped, true /* 1xW transpose */);
+ TensorShape reshaped_weights_shape;
+
+ if(_is_fully_connected_convolution)
+ {
+ reshaped_weights_shape = TensorShape{ mat_weights_cols, mat_weights_rows };
+ }
+ else
+ {
+ // Create tensor to store transposed weights
+ const float transpose_width = 16.0f / input->info()->element_size();
+ reshaped_weights_shape = TensorShape{ mat_weights_rows *static_cast<unsigned int>(transpose_width),
+ static_cast<unsigned int>(std::ceil(mat_weights_cols / transpose_width)) };
+ }
+
+ // Create tensor to store the reshaped weights
+ _weights_reshaped.allocator()->init(TensorInfo(reshaped_weights_shape, 1, dt, fixed_point_position));
+ _reshape_weights.configure(weights, biases, &_weights_reshaped, !_is_fully_connected_convolution /* 1xW transpose */);
+ weights = &_weights_reshaped;
}
- weights = &_weights_reshaped;
}
// Create tensor to store im2col reshaped inputs
const unsigned int mat_input_cols = mat_weights_rows;
const unsigned int mat_input_rows = conv_w * conv_h;
- TensorShape shape_im2col = input->info()->tensor_shape();
+
+ TensorShape shape_im2col(input->info()->tensor_shape());
shape_im2col.set(0, mat_input_cols);
shape_im2col.set(1, mat_input_rows);
shape_im2col.set(2, 1);
@@ -185,7 +223,7 @@ void NEConvolutionLayer::configure(const ITensor *input, const ITensor *weights,
// Create tensor (interleave) to prepare input tensor for GEMM
if(!_is_fully_connected_convolution)
{
- TensorShape shape_interleaved = shape_im2col;
+ TensorShape shape_interleaved(shape_im2col);
shape_interleaved.set(0, shape_interleaved.x() * 4);
shape_interleaved.set(1, std::ceil(shape_interleaved.y() / 4.f));
_input_interleaved_reshaped.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position));
@@ -193,7 +231,7 @@ void NEConvolutionLayer::configure(const ITensor *input, const ITensor *weights,
}
// Create GEMM output tensor
- TensorShape shape_gemm = _input_im2col_reshaped.info()->tensor_shape();
+ TensorShape shape_gemm(_input_im2col_reshaped.info()->tensor_shape());
shape_gemm.set(0, mat_weights_cols);
shape_gemm.set(1, mat_input_rows);
_gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, dt, fixed_point_position));
@@ -201,16 +239,49 @@ void NEConvolutionLayer::configure(const ITensor *input, const ITensor *weights,
// Configure kernels
_input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _has_bias);
- if(_is_fully_connected_convolution)
+
+#if defined(__aarch64__)
+ if(_mm_optimised_kernel != nullptr)
{
- _mm_kernel.configure(&_input_im2col_reshaped, weights, &_gemm_output, 1.0f);
+ struct CPUInfo ci = NEScheduler::get().cpu_info();
+
+ const int M = _gemm_output.info()->tensor_shape().y();
+ const int N = _gemm_output.info()->tensor_shape().x();
+ const int K = _input_im2col_reshaped.info()->tensor_shape().x();
+
+ GemmInterleaved<sgemm_12x8, float, float> gemm(&ci, M, N, K, false, false);
+
+ constexpr size_t alignment = 4096;
+ _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
+ _memory_group.manage(&_workspace);
+
+ // Configure matrix multiplication kernel
+ if(_is_fully_connected_convolution)
+ {
+ _mm_optimised_kernel->configure(&_input_im2col_reshaped, weights, &_gemm_output, &_workspace, 1.f, 0.f, false, false);
+ }
+ else
+ {
+ _mm_optimised_kernel->configure(&_input_im2col_reshaped, weights, &_gemm_output, &_workspace);
+ }
+
+ _workspace.allocator()->allocate();
}
else
+#endif /* defined(__aarch64__) */
{
- _input_interleave_kernel.configure(&_input_im2col_reshaped, &_input_interleaved_reshaped);
- _mm_kernel.configure(&_input_interleaved_reshaped, weights, &_gemm_output, 1.0f);
- _input_interleaved_reshaped.allocator()->allocate();
+ if(_is_fully_connected_convolution)
+ {
+ _mm_kernel.configure(&_input_im2col_reshaped, weights, &_gemm_output, 1.0f);
+ }
+ else
+ {
+ _input_interleave_kernel.configure(&_input_im2col_reshaped, &_input_interleaved_reshaped);
+ _mm_kernel.configure(&_input_interleaved_reshaped, weights, &_gemm_output, 1.0f);
+ _input_interleaved_reshaped.allocator()->allocate();
+ }
}
+
_input_im2col_reshaped.allocator()->allocate();
_output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
_gemm_output.allocator()->allocate();
@@ -237,17 +308,26 @@ void NEConvolutionLayer::run()
// Run input reshaping
NEScheduler::get().schedule(&_input_im2col_kernel, Window::DimY);
- if(!_is_fully_connected_convolution)
+
+ // Runs matrix multiply on reshaped matrices
+ if(_mm_optimised_kernel != nullptr)
{
- // Run interleave
- NEScheduler::get().schedule(&_input_interleave_kernel, Window::DimY);
+ NEScheduler::get().schedule(_mm_optimised_kernel.get(), Window::DimY);
}
+ else
+ {
+ if(!_is_fully_connected_convolution)
+ {
+ // Run interleave
+ NEScheduler::get().schedule(&_input_interleave_kernel, Window::DimY);
+ }
- // Runs matrix multiply on reshaped matrices
- NEScheduler::get().schedule(&_mm_kernel, Window::DimY);
+ NEScheduler::get().schedule(&_mm_kernel, Window::DimY);
+ }
// Reshape output matrix
NEScheduler::get().schedule(&_output_col2im_kernel, Window::DimY);
_memory_group.release();
}
+} // namespace arm_compute
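Editor's note: the workspace tensor above is sized as (gemm.get_working_size() + alignment - 1) * NEScheduler::get().num_threads(), presumably so that each worker thread can carve out its own 4096-byte-aligned region from one shared buffer. A minimal standalone sketch of that partitioning follows; the names per_thread_size/num_threads are illustrative placeholders, not library API.

    // Sketch: split a shared workspace, sized (per_thread_size + alignment - 1)
    // per thread, into one aligned region per thread. Illustration only.
    #include <cstddef>
    #include <cstdint>
    #include <iostream>
    #include <memory> // std::align
    #include <vector>

    int main()
    {
        const std::size_t per_thread_size = 12345; // stand-in for gemm.get_working_size()
        const std::size_t alignment       = 4096;
        const int         num_threads     = 4;

        // Worst-case padding per thread is (alignment - 1) bytes, hence the formula.
        const std::size_t total = (per_thread_size + alignment - 1) * num_threads;
        std::vector<std::uint8_t> workspace(total);

        std::uint8_t *base = workspace.data();
        for(int t = 0; t < num_threads; ++t)
        {
            void       *ptr     = base + t * (per_thread_size + alignment - 1);
            std::size_t space   = per_thread_size + alignment - 1;
            void       *aligned = std::align(alignment, per_thread_size, ptr, space);
            std::cout << "thread " << t << " workspace at " << aligned << '\n';
        }
        return 0;
    }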
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index 85b283cd41..1d6aa65e37 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -26,18 +26,27 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/TensorAllocator.h"
+#include "support/ToolchainSupport.h"
-#include <cmath>
+namespace arm_compute
+{
+#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp"
+} // namespace arm_compute
-using namespace arm_compute;
+#include <cmath>
+namespace arm_compute
+{
NEGEMM::NEGEMM(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _run_vector_matrix_multiplication(false), _run_addition(false)
+ : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _mm_optimised_kernel(nullptr), _ma_kernel(), _tmp_a(), _tmp_b(), _workspace(),
+ _run_vector_matrix_multiplication(false), _run_addition(false)
{
}
@@ -57,57 +66,94 @@ void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITe
ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(1) != d->info()->dimension(1), "The C matrix must have the same number of columns as the output matrix");
}
- // Check if the first input tensor is a vector. If so, all the kernels for reshaping the tensors can be skipped
- if((a->info()->dimension(1) == 1))
+ _run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
+
+#if defined(__aarch64__)
+ if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f))
{
- _run_vector_matrix_multiplication = true;
+ _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch64Kernel>();
+ }
+#endif /* defined(__aarch64__) */
+ // Check if the first input tensor is a vector.
+ // If so, all the kernels for reshaping the tensors can be skipped
+ if(_run_vector_matrix_multiplication)
+ {
// Configure the matrix multiply kernel
_mm_kernel.configure(a, b, d, alpha);
+
+ // Configure matrix addition kernel
+ if(beta != 0 && c != nullptr)
+ {
+ _ma_kernel.configure(c, d, beta);
+ _run_addition = true;
+ }
}
else
{
- _run_vector_matrix_multiplication = false;
+#if defined(__aarch64__)
+ if(_mm_optimised_kernel != nullptr)
+ {
+ struct CPUInfo ci = NEScheduler::get().cpu_info();
- TensorShape shape_tmp_a = a->info()->tensor_shape();
- TensorShape shape_tmp_b = b->info()->tensor_shape();
+ const int M = d->info()->tensor_shape().y();
+ const int N = d->info()->tensor_shape().x();
+ const int K = a->info()->tensor_shape().x();
- shape_tmp_a.set(0, a->info()->dimension(0) * 4);
- shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.0f));
+ GemmInterleaved<sgemm_12x8, float, float> gemm(&ci, M, N, K, false, false);
- const unsigned int transpose_w = 16 / data_size_from_type(b->info()->data_type());
- shape_tmp_b.set(0, b->info()->dimension(1) * transpose_w);
- shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / static_cast<float>(transpose_w)));
+ constexpr size_t alignment = 4096;
+ _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
+ _memory_group.manage(&_workspace);
- TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type(), a->info()->fixed_point_position());
- TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type(), a->info()->fixed_point_position());
+ // Configure matrix multiplication kernel
+ _mm_optimised_kernel->configure(a, b, d, &_workspace, alpha, 0.f);
- _tmp_a.allocator()->init(info_a);
- _tmp_b.allocator()->init(info_b);
+ _workspace.allocator()->allocate();
+ }
+ else
+#endif /* defined(__aarch64__) */
+ {
+ TensorShape shape_tmp_a = a->info()->tensor_shape();
+ TensorShape shape_tmp_b = b->info()->tensor_shape();
- // Manage intermediate buffers
- _memory_group.manage(&_tmp_a);
- _memory_group.manage(&_tmp_b);
+ shape_tmp_a.set(0, a->info()->dimension(0) * 4);
+ shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.0f));
- // Configure interleave kernel
- _interleave_kernel.configure(a, &_tmp_a);
+ const unsigned int transpose_w = 16 / data_size_from_type(b->info()->data_type());
+ shape_tmp_b.set(0, b->info()->dimension(1) * transpose_w);
+ shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / static_cast<float>(transpose_w)));
- // Configure transpose kernel
- _transpose_kernel.configure(b, &_tmp_b);
+ TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type(), a->info()->fixed_point_position());
+ TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type(), a->info()->fixed_point_position());
- // Configure matrix multiplication kernel
- _mm_kernel.configure(&_tmp_a, &_tmp_b, d, alpha);
+ _tmp_a.allocator()->init(info_a);
+ _tmp_b.allocator()->init(info_b);
- // Allocate once the all configure methods have been called
- _tmp_a.allocator()->allocate();
- _tmp_b.allocator()->allocate();
- }
+ // Manage intermediate buffers
+ _memory_group.manage(&_tmp_a);
+ _memory_group.manage(&_tmp_b);
- // Configure matrix addition kernel
- if(beta != 0 && c != nullptr)
- {
- _ma_kernel.configure(c, d, beta);
- _run_addition = true;
+ // Configure interleave kernel
+ _interleave_kernel.configure(a, &_tmp_a);
+
+ // Configure transpose kernel
+ _transpose_kernel.configure(b, &_tmp_b);
+
+ // Configure matrix multiplication kernel
+ _mm_kernel.configure(&_tmp_a, &_tmp_b, d, alpha);
+
+ // Allocate once all the configure methods have been called

+ _tmp_a.allocator()->allocate();
+ _tmp_b.allocator()->allocate();
+
+ // Configure matrix addition kernel
+ if(beta != 0 && c != nullptr)
+ {
+ _ma_kernel.configure(c, d, beta);
+ _run_addition = true;
+ }
+ }
}
}
@@ -115,23 +161,31 @@ void NEGEMM::run()
{
_memory_group.acquire();
- if(!_run_vector_matrix_multiplication)
+ if(_mm_optimised_kernel != nullptr)
{
- // Run interleave kernel
- NEScheduler::get().schedule(&_interleave_kernel, Window::DimY);
-
- // Run transpose kernel
- NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
+ NEScheduler::get().schedule(_mm_optimised_kernel.get(), Window::DimY);
+ _memory_group.release();
}
+ else
+ {
+ if(!_run_vector_matrix_multiplication)
+ {
+ // Run interleave kernel
+ NEScheduler::get().schedule(&_interleave_kernel, Window::DimY);
- // Run matrix multiply kernel
- NEScheduler::get().schedule(&_mm_kernel, _run_vector_matrix_multiplication ? Window::DimX : Window::DimY);
+ // Run transpose kernel
+ NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
+ }
- _memory_group.release();
+ NEScheduler::get().schedule(&_mm_kernel, _run_vector_matrix_multiplication ? Window::DimX : Window::DimY);
- // Run matrix addition kernel
- if(_run_addition)
- {
- NEScheduler::get().schedule(&_ma_kernel, Window::DimY);
+ _memory_group.release();
+
+ // Run matrix addition kernel
+ if(_run_addition)
+ {
+ NEScheduler::get().schedule(&_ma_kernel, Window::DimY);
+ }
}
}
+} // namespace arm_compute
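Editor's note: the optimised kernel is only selected when the accumulation input is absent or beta is 0, and M, N, K are read from the output and input shapes as shown above. A plain-C++ reference of the computation that path performs, D = alpha * A * B, is sketched below for orientation; it is not the assembly kernel itself.

    // Reference SGEMM with the same M/N/K convention as the diff:
    //   M = rows of D, N = columns of D, K = columns of A (== rows of B).
    #include <vector>

    void sgemm_reference(const std::vector<float> &a, const std::vector<float> &b,
                         std::vector<float> &d, int M, int N, int K, float alpha)
    {
        for(int m = 0; m < M; ++m)
        {
            for(int n = 0; n < N; ++n)
            {
                float acc = 0.f;
                for(int k = 0; k < K; ++k)
                {
                    acc += a[m * K + k] * b[k * N + n]; // row-major A and B
                }
                d[m * N + n] = alpha * acc; // beta == 0 on this path, so no C term
            }
        }
    }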
diff --git a/support/ToolchainSupport.h b/support/ToolchainSupport.h
index 87e9bd2bc8..b9d9103652 100644
--- a/support/ToolchainSupport.h
+++ b/support/ToolchainSupport.h
@@ -268,6 +268,23 @@ inline std::string to_string(bool value)
str << std::boolalpha << value;
return str.str();
}
+
+// std::align is missing in GCC 4.9
+// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=57350
+inline void *align(std::size_t alignment, std::size_t size, void *&ptr, std::size_t &space)
+{
+ std::uintptr_t pn = reinterpret_cast<std::uintptr_t>(ptr);
+ std::uintptr_t aligned = (pn + alignment - 1) & -alignment;
+ std::size_t padding = aligned - pn;
+ if(space < size + padding)
+ {
+ return nullptr;
+ }
+
+ space -= padding;
+
+ return ptr = reinterpret_cast<void *>(aligned);
+}
} // namespace cpp11
namespace cpp14
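Editor's note: a short usage sketch of the std::align work-around introduced above (a standalone copy is included here so the snippet compiles on its own; in the header the function sits in the cpp11 namespace). It bumps a pointer inside a buffer up to the next boundary and shrinks the remaining space accordingly, mirroring std::align semantics.

    #include <cstddef>
    #include <cstdint>
    #include <iostream>
    #include <vector>

    void *align(std::size_t alignment, std::size_t size, void *&ptr, std::size_t &space)
    {
        std::uintptr_t pn      = reinterpret_cast<std::uintptr_t>(ptr);
        std::uintptr_t aligned = (pn + alignment - 1) & -alignment; // round up (power-of-two alignment)
        std::size_t    padding = aligned - pn;
        if(space < size + padding)
        {
            return nullptr; // not enough room once padded
        }
        space -= padding;
        return ptr = reinterpret_cast<void *>(aligned);
    }

    int main()
    {
        std::vector<std::uint8_t> buffer(256);
        void       *ptr   = buffer.data() + 1; // deliberately misaligned
        std::size_t space = buffer.size() - 1;
        if(align(64, 128, ptr, space) != nullptr)
        {
            std::cout << "aligned pointer: " << ptr << ", remaining space: " << space << '\n';
        }
        return 0;
    }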
diff --git a/tests/networks/AlexNetNetwork.h b/tests/networks/AlexNetNetwork.h
index 1e99503792..0c06c1860f 100644
--- a/tests/networks/AlexNetNetwork.h
+++ b/tests/networks/AlexNetNetwork.h
@@ -24,6 +24,7 @@
#ifndef __ARM_COMPUTE_TEST_MODEL_OBJECTS_ALEXNET_H__
#define __ARM_COMPUTE_TEST_MODEL_OBJECTS_ALEXNET_H__
+#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"
#include "tests/AssetsLibrary.h"
@@ -80,55 +81,64 @@ public:
w[7].allocator()->init(TensorInfo(TensorShape(4096U, 1000U), 1, _data_type, _fixed_point_position));
b[7].allocator()->init(TensorInfo(TensorShape(1000U), 1, _data_type, _fixed_point_position));
- w21 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[1], TensorShape(5U, 5U, 48U, 128U), Coordinates()));
- w22 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[1], TensorShape(5U, 5U, 48U, 128U), Coordinates(0, 0, 0, 128)));
- b21 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[1], TensorShape(128U), Coordinates()));
- b22 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[1], TensorShape(128U), Coordinates(128)));
+ w11 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[1], TensorShape(5U, 5U, 48U, 128U), Coordinates()));
+ w12 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[1], TensorShape(5U, 5U, 48U, 128U), Coordinates(0, 0, 0, 128)));
+ b11 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[1], TensorShape(128U), Coordinates()));
+ b12 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[1], TensorShape(128U), Coordinates(128)));
- w41 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[3], TensorShape(3U, 3U, 192U, 192U), Coordinates()));
- w42 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[3], TensorShape(3U, 3U, 192U, 192U), Coordinates(0, 0, 0, 192)));
- b41 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[3], TensorShape(192U), Coordinates()));
- b42 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[3], TensorShape(192U), Coordinates(192)));
+ w31 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[3], TensorShape(3U, 3U, 192U, 192U), Coordinates()));
+ w32 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[3], TensorShape(3U, 3U, 192U, 192U), Coordinates(0, 0, 0, 192)));
+ b31 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[3], TensorShape(192U), Coordinates()));
+ b32 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[3], TensorShape(192U), Coordinates(192)));
- w51 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[4], TensorShape(3U, 3U, 192U, 128U), Coordinates()));
- w52 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[4], TensorShape(3U, 3U, 192U, 128U), Coordinates(0, 0, 0, 128)));
- b51 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[4], TensorShape(128U), Coordinates()));
- b52 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[4], TensorShape(128U), Coordinates(128)));
+ w41 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[4], TensorShape(3U, 3U, 192U, 128U), Coordinates()));
+ w42 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[4], TensorShape(3U, 3U, 192U, 128U), Coordinates(0, 0, 0, 128)));
+ b41 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[4], TensorShape(128U), Coordinates()));
+ b42 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[4], TensorShape(128U), Coordinates(128)));
}
else
{
- auto reshape = [&](unsigned int width, unsigned int height) -> TensorShape
+ auto reshape = [&](unsigned int width, unsigned int height, bool convolution_layer) -> TensorShape
{
- const int interleave_width = 16 / arm_compute::data_size_from_type(_data_type);
+ const bool is_optimised = std::is_same<ITensorType, ITensor>::value && NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && data_type == DataType::F32;
- return TensorShape{ width * interleave_width, static_cast<unsigned int>(std::ceil(static_cast<float>(height) / interleave_width)) };
+ if(convolution_layer && is_optimised)
+ {
+ return TensorShape{ height, width };
+ }
+ else
+ {
+ const int interleave_width = 16 / arm_compute::data_size_from_type(_data_type);
+
+ return TensorShape{ width * interleave_width, static_cast<unsigned int>(std::ceil(static_cast<float>(height) / interleave_width)) };
+ }
};
// Create tensor for the reshaped weights
- w[0].allocator()->init(TensorInfo(reshape(366U, 96U), 1, _data_type, _fixed_point_position));
+ w[0].allocator()->init(TensorInfo(reshape(366U, 96U, true), 1, _data_type, _fixed_point_position));
// Configure the direct convolution's weights. Direct convolution doesn't need reshape weights
if(!_is_direct_conv)
{
- auto w21_tensor = std::unique_ptr<TensorType>(new TensorType());
- auto w22_tensor = std::unique_ptr<TensorType>(new TensorType());
+ auto w11_tensor = std::unique_ptr<TensorType>(new TensorType());
+ auto w12_tensor = std::unique_ptr<TensorType>(new TensorType());
+ auto w31_tensor = std::unique_ptr<TensorType>(new TensorType());
+ auto w32_tensor = std::unique_ptr<TensorType>(new TensorType());
auto w41_tensor = std::unique_ptr<TensorType>(new TensorType());
auto w42_tensor = std::unique_ptr<TensorType>(new TensorType());
- auto w51_tensor = std::unique_ptr<TensorType>(new TensorType());
- auto w52_tensor = std::unique_ptr<TensorType>(new TensorType());
- w21_tensor->allocator()->init(TensorInfo(reshape(1248U, 128U), 1, _data_type, _fixed_point_position));
- w22_tensor->allocator()->init(TensorInfo(reshape(1248U, 128U), 1, _data_type, _fixed_point_position));
- w41_tensor->allocator()->init(TensorInfo(reshape(1920U, 192U), 1, _data_type, _fixed_point_position));
- w42_tensor->allocator()->init(TensorInfo(reshape(1920U, 192U), 1, _data_type, _fixed_point_position));
- w51_tensor->allocator()->init(TensorInfo(reshape(1920U, 128U), 1, _data_type, _fixed_point_position));
- w52_tensor->allocator()->init(TensorInfo(reshape(1920U, 128U), 1, _data_type, _fixed_point_position));
- w[2].allocator()->init(TensorInfo(reshape(2560U, 384U), 1, _data_type, _fixed_point_position));
- w21 = std::move(w21_tensor);
- w22 = std::move(w22_tensor);
+ w11_tensor->allocator()->init(TensorInfo(reshape(1248U, 128U, true), 1, _data_type, _fixed_point_position));
+ w12_tensor->allocator()->init(TensorInfo(reshape(1248U, 128U, true), 1, _data_type, _fixed_point_position));
+ w31_tensor->allocator()->init(TensorInfo(reshape(1920U, 192U, true), 1, _data_type, _fixed_point_position));
+ w32_tensor->allocator()->init(TensorInfo(reshape(1920U, 192U, true), 1, _data_type, _fixed_point_position));
+ w41_tensor->allocator()->init(TensorInfo(reshape(1920U, 128U, true), 1, _data_type, _fixed_point_position));
+ w42_tensor->allocator()->init(TensorInfo(reshape(1920U, 128U, true), 1, _data_type, _fixed_point_position));
+ w[2].allocator()->init(TensorInfo(reshape(2560U, 384U, true), 1, _data_type, _fixed_point_position));
+ w11 = std::move(w11_tensor);
+ w12 = std::move(w12_tensor);
+ w31 = std::move(w31_tensor);
+ w32 = std::move(w32_tensor);
w41 = std::move(w41_tensor);
w42 = std::move(w42_tensor);
- w51 = std::move(w51_tensor);
- w52 = std::move(w52_tensor);
}
else
{
@@ -140,20 +150,20 @@ public:
b[3].allocator()->init(TensorInfo(TensorShape(384U), 1, _data_type, _fixed_point_position));
w[4].allocator()->init(TensorInfo(TensorShape(3U, 3U, 192U, 256U), 1, _data_type, _fixed_point_position));
b[4].allocator()->init(TensorInfo(TensorShape(256U), 1, _data_type, _fixed_point_position));
- w21 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[1], TensorShape(5U, 5U, 48U, 128U), Coordinates()));
- w22 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[1], TensorShape(5U, 5U, 48U, 128U), Coordinates(0, 0, 0, 128)));
- b21 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[1], TensorShape(128U), Coordinates()));
- b22 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[1], TensorShape(128U), Coordinates(128)));
-
- w41 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[3], TensorShape(3U, 3U, 192U, 192U), Coordinates()));
- w42 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[3], TensorShape(3U, 3U, 192U, 192U), Coordinates(0, 0, 0, 192)));
- b41 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[3], TensorShape(192U), Coordinates()));
- b42 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[3], TensorShape(192U), Coordinates(192)));
-
- w51 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[4], TensorShape(3U, 3U, 192U, 128U), Coordinates()));
- w52 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[4], TensorShape(3U, 3U, 192U, 128U), Coordinates(0, 0, 0, 128)));
- b51 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[4], TensorShape(128U), Coordinates()));
- b52 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[4], TensorShape(128U), Coordinates(128)));
+ w11 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[1], TensorShape(5U, 5U, 48U, 128U), Coordinates()));
+ w12 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[1], TensorShape(5U, 5U, 48U, 128U), Coordinates(0, 0, 0, 128)));
+ b11 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[1], TensorShape(128U), Coordinates()));
+ b12 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[1], TensorShape(128U), Coordinates(128)));
+
+ w31 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[3], TensorShape(3U, 3U, 192U, 192U), Coordinates()));
+ w32 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[3], TensorShape(3U, 3U, 192U, 192U), Coordinates(0, 0, 0, 192)));
+ b31 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[3], TensorShape(192U), Coordinates()));
+ b32 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[3], TensorShape(192U), Coordinates(192)));
+
+ w41 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[4], TensorShape(3U, 3U, 192U, 128U), Coordinates()));
+ w42 = std::unique_ptr<SubTensorType>(new SubTensorType(&w[4], TensorShape(3U, 3U, 192U, 128U), Coordinates(0, 0, 0, 128)));
+ b41 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[4], TensorShape(128U), Coordinates()));
+ b42 = std::unique_ptr<SubTensorType>(new SubTensorType(&b[4], TensorShape(128U), Coordinates(128)));
}
b[5].allocator()->init(TensorInfo(TensorShape(4096U), 1, _data_type, _fixed_point_position));
@@ -162,9 +172,9 @@ public:
if(_batches > 1 && std::is_same<TensorType, Tensor>::value)
{
- w[5].allocator()->init(TensorInfo(reshape(9216U, 4096U), 1, _data_type, _fixed_point_position));
- w[6].allocator()->init(TensorInfo(reshape(4096U, 4096U), 1, _data_type, _fixed_point_position));
- w[7].allocator()->init(TensorInfo(reshape(4096U, 1000U), 1, _data_type, _fixed_point_position));
+ w[5].allocator()->init(TensorInfo(reshape(9216U, 4096U, false), 1, _data_type, _fixed_point_position));
+ w[6].allocator()->init(TensorInfo(reshape(4096U, 4096U, false), 1, _data_type, _fixed_point_position));
+ w[7].allocator()->init(TensorInfo(reshape(4096U, 1000U, false), 1, _data_type, _fixed_point_position));
}
else
{
@@ -230,8 +240,8 @@ public:
norm1.configure(&act1_out, &norm1_out, NormalizationLayerInfo(NormType::CROSS_MAP, 5, 0.0001f, 0.75f));
pool1.configure(&norm1_out, &pool1_out, PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0)));
// Layer 2
- conv21.configure(pool11_out.get(), w21.get(), b21.get(), conv21_out.get(), PadStrideInfo(1, 1, 2, 2), WeightsInfo(_reshaped_weights, 5U, 5U, 128U));
- conv22.configure(pool12_out.get(), w22.get(), b22.get(), conv22_out.get(), PadStrideInfo(1, 1, 2, 2), WeightsInfo(_reshaped_weights, 5U, 5U, 128U));
+ conv21.configure(pool11_out.get(), w11.get(), b11.get(), conv21_out.get(), PadStrideInfo(1, 1, 2, 2), WeightsInfo(_reshaped_weights, 5U, 5U, 128U));
+ conv22.configure(pool12_out.get(), w12.get(), b12.get(), conv22_out.get(), PadStrideInfo(1, 1, 2, 2), WeightsInfo(_reshaped_weights, 5U, 5U, 128U));
act2.configure(&conv2_out, &act2_out, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
norm2.configure(&act2_out, &norm2_out, NormalizationLayerInfo(NormType::CROSS_MAP, 5, 0.0001f, 0.75f));
pool2.configure(&norm2_out, &pool2_out, PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0)));
@@ -240,12 +250,12 @@ public:
conv3.configure(&pool2_out, &w[2], b2, &conv3_out, PadStrideInfo(1, 1, 1, 1), WeightsInfo(_reshaped_weights, 3U, 3U, 384U));
act3.configure(&conv3_out, &act3_out, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
// Layer 4
- conv41.configure(act31_out.get(), w41.get(), b41.get(), conv41_out.get(), PadStrideInfo(1, 1, 1, 1), WeightsInfo(_reshaped_weights, 3U, 3U, 192U));
- conv42.configure(act32_out.get(), w42.get(), b42.get(), conv42_out.get(), PadStrideInfo(1, 1, 1, 1), WeightsInfo(_reshaped_weights, 3U, 3U, 192U));
+ conv41.configure(act31_out.get(), w31.get(), b31.get(), conv41_out.get(), PadStrideInfo(1, 1, 1, 1), WeightsInfo(_reshaped_weights, 3U, 3U, 192U));
+ conv42.configure(act32_out.get(), w32.get(), b32.get(), conv42_out.get(), PadStrideInfo(1, 1, 1, 1), WeightsInfo(_reshaped_weights, 3U, 3U, 192U));
act4.configure(&conv4_out, &act4_out, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
// Layer 5
- conv51.configure(act41_out.get(), w51.get(), b51.get(), conv51_out.get(), PadStrideInfo(1, 1, 1, 1), WeightsInfo(_reshaped_weights, 3U, 3U, 128U));
- conv52.configure(act42_out.get(), w52.get(), b52.get(), conv52_out.get(), PadStrideInfo(1, 1, 1, 1), WeightsInfo(_reshaped_weights, 3U, 3U, 128U));
+ conv51.configure(act41_out.get(), w41.get(), b41.get(), conv51_out.get(), PadStrideInfo(1, 1, 1, 1), WeightsInfo(_reshaped_weights, 3U, 3U, 128U));
+ conv52.configure(act42_out.get(), w42.get(), b42.get(), conv52_out.get(), PadStrideInfo(1, 1, 1, 1), WeightsInfo(_reshaped_weights, 3U, 3U, 128U));
act5.configure(&conv5_out, &act5_out, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
pool5.configure(&act5_out, &pool5_out, PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0)));
// Layer 6
@@ -291,12 +301,12 @@ public:
if(!_is_direct_conv)
{
- dynamic_cast<TensorType *>(w21.get())->allocator()->allocate();
- dynamic_cast<TensorType *>(w22.get())->allocator()->allocate();
+ dynamic_cast<TensorType *>(w11.get())->allocator()->allocate();
+ dynamic_cast<TensorType *>(w12.get())->allocator()->allocate();
+ dynamic_cast<TensorType *>(w31.get())->allocator()->allocate();
+ dynamic_cast<TensorType *>(w32.get())->allocator()->allocate();
dynamic_cast<TensorType *>(w41.get())->allocator()->allocate();
dynamic_cast<TensorType *>(w42.get())->allocator()->allocate();
- dynamic_cast<TensorType *>(w51.get())->allocator()->allocate();
- dynamic_cast<TensorType *>(w52.get())->allocator()->allocate();
}
else
{
@@ -359,12 +369,12 @@ public:
if(!_is_direct_conv)
{
- library->fill_tensor_uniform(Accessor(*dynamic_cast<TensorType *>(w21.get())), 9);
- library->fill_tensor_uniform(Accessor(*dynamic_cast<TensorType *>(w22.get())), 10);
- library->fill_tensor_uniform(Accessor(*dynamic_cast<TensorType *>(w41.get())), 11);
- library->fill_tensor_uniform(Accessor(*dynamic_cast<TensorType *>(w42.get())), 12);
- library->fill_tensor_uniform(Accessor(*dynamic_cast<TensorType *>(w51.get())), 13);
- library->fill_tensor_uniform(Accessor(*dynamic_cast<TensorType *>(w52.get())), 14);
+ library->fill_tensor_uniform(Accessor(*dynamic_cast<TensorType *>(w11.get())), 9);
+ library->fill_tensor_uniform(Accessor(*dynamic_cast<TensorType *>(w12.get())), 10);
+ library->fill_tensor_uniform(Accessor(*dynamic_cast<TensorType *>(w31.get())), 11);
+ library->fill_tensor_uniform(Accessor(*dynamic_cast<TensorType *>(w32.get())), 12);
+ library->fill_tensor_uniform(Accessor(*dynamic_cast<TensorType *>(w41.get())), 13);
+ library->fill_tensor_uniform(Accessor(*dynamic_cast<TensorType *>(w42.get())), 14);
}
else
{
@@ -481,18 +491,18 @@ public:
}
}
- w21.reset();
- w22.reset();
- b21.reset();
- b21.reset();
+ w11.reset();
+ w12.reset();
+ b11.reset();
+ b12.reset();

+ w31.reset();
+ w32.reset();
+ b31.reset();
+ b32.reset();
w41.reset();
w42.reset();
b41.reset();
b42.reset();
- w51.reset();
- w52.reset();
- b51.reset();
- b52.reset();
conv1_out.allocator()->free();
act1_out.allocator()->free();
@@ -595,9 +605,9 @@ private:
TensorType input{}, output{};
std::array<TensorType, 8> w{ {} }, b{ {} };
- std::unique_ptr<ITensorType> w21{ nullptr }, w22{ nullptr }, b21{ nullptr }, b22{ nullptr };
+ std::unique_ptr<ITensorType> w11{ nullptr }, w12{ nullptr }, b11{ nullptr }, b12{ nullptr };
+ std::unique_ptr<ITensorType> w31{ nullptr }, w32{ nullptr }, b31{ nullptr }, b32{ nullptr };
std::unique_ptr<ITensorType> w41{ nullptr }, w42{ nullptr }, b41{ nullptr }, b42{ nullptr };
- std::unique_ptr<ITensorType> w51{ nullptr }, w52{ nullptr }, b51{ nullptr }, b52{ nullptr };
TensorType conv1_out{}, act1_out{}, norm1_out{}, pool1_out{};
TensorType conv2_out{}, act2_out{}, pool2_out{}, norm2_out{};
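Editor's note: the reshape lambda changed above now produces two different weight layouts, a plain { height, width } transpose when the optimised aarch64 convolution path is taken and the usual interleave-by-(16 / element size) layout otherwise. A small mirror of that calculation, for illustration only (the helper name and parameters below are not part of the test network):

    #include <cmath>
    #include <cstdio>
    #include <utility>

    std::pair<unsigned int, unsigned int> reshape(unsigned int width, unsigned int height,
                                                  bool optimised, unsigned int element_size)
    {
        if(optimised)
        {
            return { height, width }; // plain transpose, no interleave
        }
        const unsigned int interleave_width = 16 / element_size;
        return { width * interleave_width,
                 static_cast<unsigned int>(std::ceil(static_cast<float>(height) / interleave_width)) };
    }

    int main()
    {
        const auto generic   = reshape(366U, 96U, false, sizeof(float)); // { 1464, 24 }
        const auto optimised = reshape(366U, 96U, true, sizeof(float));  // { 96, 366 }
        std::printf("generic: %ux%u, optimised: %ux%u\n",
                    generic.first, generic.second, optimised.first, optimised.second);
        return 0;
    }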
diff --git a/tests/validation/fixtures/ConvolutionLayerFixture.h b/tests/validation/fixtures/ConvolutionLayerFixture.h
index dd2df727e9..fcaf4ef42b 100644
--- a/tests/validation/fixtures/ConvolutionLayerFixture.h
+++ b/tests/validation/fixtures/ConvolutionLayerFixture.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "tests/AssetsLibrary.h"
#include "tests/Globals.h"
#include "tests/IAccessor.h"
@@ -39,6 +40,8 @@
namespace arm_compute
{
+class NEConvolutionLayer;
+
namespace test
{
namespace validation
@@ -85,6 +88,8 @@ protected:
{
// Check if its a "fully connected" convolution
const bool is_fully_connected_convolution = (output_shape.x() == 1 && output_shape.y() == 1);
+ const bool is_optimised = std::is_same<FunctionType, NEConvolutionLayer>::value && NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && data_type == DataType::F32;
+
reshaped_weights_shape.collapse(3);
if(bias_shape.total_size() > 0)
@@ -92,7 +97,7 @@ protected:
reshaped_weights_shape.set(0, reshaped_weights_shape.x() + 1);
}
- if(is_fully_connected_convolution)
+ if(is_fully_connected_convolution || is_optimised)
{
const size_t shape_x = reshaped_weights_shape.x();
reshaped_weights_shape.set(0, reshaped_weights_shape.y());
@@ -138,6 +143,7 @@ protected:
if(!reshape_weights)
{
const bool is_fully_connected_convolution = (output_shape.x() == 1 && output_shape.y() == 1);
+ const bool is_optimised = std::is_same<FunctionType, NEConvolutionLayer>::value && NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && data_type == DataType::F32;
TensorShape tmp_weights_shape(weights_shape);
SimpleTensor<T> tmp_weights(tmp_weights_shape, data_type, 1, fixed_point_position);
@@ -149,7 +155,7 @@ protected:
tmp_weights = linearise_weights(tmp_weights, &tmp_bias);
- if(!is_fully_connected_convolution)
+ if(!is_fully_connected_convolution && !is_optimised)
{
// Transpose with interleave
const int interleave_size = 16 / tmp_weights.element_size();
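Editor's note: the "transpose with interleave" mentioned at the end of this fixture is the B-matrix reshape used by the generic GEMM path (interleave width 16 / element size, i.e. 4 for F32). A standalone sketch of one plausible layout for that reshape, included only to illustrate the shape arithmetic and not taken from the kernel itself:

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // B is K x N, row-major. The output has ceil(N / 4) row blocks; each block
    // stores, for every k, the 4 consecutive columns of B laid out contiguously.
    std::vector<float> transpose_1x4(const std::vector<float> &b, int K, int N)
    {
        const int interleave = 4; // 16 / sizeof(float)
        const int out_rows   = (N + interleave - 1) / interleave;
        std::vector<float> out(static_cast<std::size_t>(out_rows) * K * interleave, 0.f); // zero-pad ragged edge

        for(int n = 0; n < N; ++n)
        {
            for(int k = 0; k < K; ++k)
            {
                const int block = n / interleave; // which output row block
                const int lane  = n % interleave; // position inside the 1x4 group
                out[(static_cast<std::size_t>(block) * K + k) * interleave + lane] = b[k * N + n];
            }
        }
        return out;
    }

    int main()
    {
        const int K = 2, N = 5;
        std::vector<float> b(K * N);
        for(int i = 0; i < K * N; ++i) { b[i] = static_cast<float>(i); }
        const auto reshaped = transpose_1x4(b, K, N);
        std::printf("reshaped size: %zu\n", reshaped.size()); // 2 blocks * 2 * 4 = 16
        return 0;
    }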