From beabe3bdf47306d0940ddf2ddf52ada6903a0875 Mon Sep 17 00:00:00 2001 From: Moritz Pflanzer Date: Thu, 31 Aug 2017 14:56:32 +0100 Subject: COMPMID-481: Add AArch64 GEMM Change-Id: I34f94f99cb05f0eabafee13c5e623ee779b72360 Reviewed-on: http://mpd-gerrit.cambridge.arm.com/83741 Tested-by: Kaizen Reviewed-by: Anthony Barbier Reviewed-by: Pablo Tello --- arm_compute/core/CPP/CPPTypes.h | 13 +- .../core/NEON/kernels/NEGEMMAssemblyBaseKernel.h | 85 +++++ .../core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h | 44 +++ arm_compute/core/NEON/kernels/assembly/asmlib.hpp | 121 +++++++ .../core/NEON/kernels/assembly/gemm_common.hpp | 33 ++ .../NEON/kernels/assembly/gemm_interleaved.hpp | 176 ++++++++++ .../kernels/assembly/kernels/a64_sgemm_12x8.hpp | 72 ++++ .../assembly/kernels/a64_sgemm_12x8/a53.hpp | 367 +++++++++++++++++++++ .../assembly/kernels/a64_sgemm_12x8/generic.hpp | 358 ++++++++++++++++++++ .../core/NEON/kernels/assembly/mergeresults.hpp | 59 ++++ .../assembly/merges/a64_merge_float_12x8.hpp | 236 +++++++++++++ .../core/NEON/kernels/assembly/merges/list.hpp | 24 ++ .../core/NEON/kernels/assembly/profiler.hpp | 97 ++++++ .../core/NEON/kernels/assembly/transform.hpp | 110 ++++++ .../transforms/a64_interleave_8way_32bit.hpp | 174 ++++++++++ .../core/NEON/kernels/assembly/transforms/list.hpp | 32 ++ .../transforms/transpose_interleave_common.hpp | 139 ++++++++ arm_compute/runtime/IScheduler.h | 26 +- .../runtime/NEON/functions/NEConvolutionLayer.h | 34 +- arm_compute/runtime/NEON/functions/NEGEMM.h | 22 +- 20 files changed, 2179 insertions(+), 43 deletions(-) create mode 100644 arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h create mode 100644 arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h create mode 100644 arm_compute/core/NEON/kernels/assembly/asmlib.hpp create mode 100644 arm_compute/core/NEON/kernels/assembly/gemm_common.hpp create mode 100644 arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp create mode 100644 arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp create mode 100644 arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a53.hpp create mode 100644 arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/generic.hpp create mode 100644 arm_compute/core/NEON/kernels/assembly/mergeresults.hpp create mode 100644 arm_compute/core/NEON/kernels/assembly/merges/a64_merge_float_12x8.hpp create mode 100644 arm_compute/core/NEON/kernels/assembly/merges/list.hpp create mode 100644 arm_compute/core/NEON/kernels/assembly/profiler.hpp create mode 100644 arm_compute/core/NEON/kernels/assembly/transform.hpp create mode 100644 arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_32bit.hpp create mode 100644 arm_compute/core/NEON/kernels/assembly/transforms/list.hpp create mode 100644 arm_compute/core/NEON/kernels/assembly/transforms/transpose_interleave_common.hpp (limited to 'arm_compute') diff --git a/arm_compute/core/CPP/CPPTypes.h b/arm_compute/core/CPP/CPPTypes.h index adad00f8c4..cff49db0ac 100644 --- a/arm_compute/core/CPP/CPPTypes.h +++ b/arm_compute/core/CPP/CPPTypes.h @@ -48,11 +48,18 @@ enum class CPUTarget A75_DOT = (A75 | DOT), }; +struct CPUInfo +{ + CPUTarget CPU{ CPUTarget::INTRINSICS }; + int L1_size{ 0 }; + int L2_size{ 0 }; +}; + struct ThreadInfo { - int thread_id{ 0 }; - int num_threads{ 1 }; - CPUTarget cpu{ CPUTarget::INTRINSICS }; + int thread_id{ 0 }; + int num_threads{ 1 }; + CPUInfo cpu_info{}; }; } // namespace arm_compute #endif /* __ARM_COMPUTE_CPP_TYPES_H__ */ diff --git 
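
The ThreadInfo handed to each kernel now carries a CPUInfo (target plus per-core L1/L2 sizes) instead of a bare CPUTarget, which is what lets the new GEMM pick cache-aware block sizes. A minimal, self-contained sketch of the idea follows; the struct is re-declared here in simplified form and pick_k_block is a hypothetical helper mirroring the GemmInterleaved constructor later in this patch, not library code:

    #include <cstddef>

    // Simplified stand-in for arm_compute::CPUInfo, for illustration only.
    struct CPUInfoSketch
    {
        int L1_size{ 32768 };   // per-core L1 data cache in bytes (assumed value)
        int L2_size{ 524288 };  // per-core L2 cache in bytes (assumed value)
    };

    // Hypothetical helper: choose how many K iterations fit in L1 when each
    // iteration touches (out_width + out_height) float operands - the same
    // calculation GemmInterleaved performs later in this patch.
    inline unsigned int pick_k_block(const CPUInfoSketch &ci, unsigned int out_width, unsigned int out_height)
    {
        return ci.L1_size / (sizeof(float) * (out_width + out_height));
    }
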
a/arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h b/arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h new file mode 100644 index 0000000000..e298bfdebd --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEGEMMASSEMBLYBASE_H__ +#define __ARM_COMPUTE_NEGEMMASSEMBLYBASE_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** AssemblyBase/armv7a NEON kernel to multiply two input matrices "A" and "B". */ +class NEGEMMAssemblyBaseKernel : public INEKernel +{ +public: + /** Constructor */ + NEGEMMAssemblyBaseKernel() + : _input0(nullptr), _input1(nullptr), _output(nullptr), _workspace(nullptr), _alpha(1.f), _beta(0.f), _transform_0(true), _transform_1(true) + { + } + + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGEMMAssemblyBaseKernel(const NEGEMMAssemblyBaseKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGEMMAssemblyBaseKernel &operator=(const NEGEMMAssemblyBaseKernel &) = delete; + /** Allow instances of this class to be moved */ + NEGEMMAssemblyBaseKernel(NEGEMMAssemblyBaseKernel &&) = default; + /** Allow instances of this class to be moved */ + NEGEMMAssemblyBaseKernel &operator=(NEGEMMAssemblyBaseKernel &&) = default; + + virtual ~NEGEMMAssemblyBaseKernel() = default; + + /** Initialise the kernel's input and output. + * + * The computed function is C = a * AxB + b * C. + * + * @param[in] input0 Input tensor containing the Matrix A. Data types supported: F32 + * @param[in] input1 Input tensor containing the Matrix B. Data types supported: same as @p input0 + * @param[in,out] output Output tensor to store the result of matrix multiplication. If @p beta is not zero the values are multiplied by @p beta before the result is accumulated. Otherwise the values are overwritten by the result. Data types supported: same as @p input0. + * @param[out] workspace Space for intermediate results. + * @param[in] alpha Weight of the matrix product + * @param[in] beta Weight of the accumulation. + * @param[in] transform_0 If true the kernel will transform @p input0 prior to the multiplication. + * @param[in] transform_1 If true the kernel will transform @p input1 prior to the multiplication. 
+ */ + void configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha = 1.f, float beta = 0.f, bool transform_0 = true, bool transform_1 = true) + { + internal_configure(input0, input1, output, workspace, alpha, beta, transform_0, transform_1); + } + +protected: + virtual void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1) = 0; + + const ITensor *_input0; + const ITensor *_input1; + ITensor *_output; + ITensor *_workspace; + float _alpha; + float _beta; + bool _transform_0; + bool _transform_1; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_NEGEMMASSEMBLYBASE_H__*/ diff --git a/arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h b/arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h new file mode 100644 index 0000000000..77431d2bc8 --- /dev/null +++ b/arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEGEMMAARCH64KERNEL_H__ +#define __ARM_COMPUTE_NEGEMMAARCH64KERNEL_H__ + +#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** AArch64 NEON kernel to multiply two input matrices "A" and "B". */ +class NEGEMMAArch64Kernel : public NEGEMMAssemblyBaseKernel +{ +public: + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +protected: + void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1) override; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_NEGEMMAARCH64KERNEL_H__*/ diff --git a/arm_compute/core/NEON/kernels/assembly/asmlib.hpp b/arm_compute/core/NEON/kernels/assembly/asmlib.hpp new file mode 100644 index 0000000000..fa1d6e37a9 --- /dev/null +++ b/arm_compute/core/NEON/kernels/assembly/asmlib.hpp @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2017 ARM Limited. 
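
As a usage note, here is a hedged sketch of wiring up the new AArch64 kernel directly. The tensors are assumed to be valid, already-allocated tensors with the data types documented above, the workspace sizing is owned by the calling function (see NEGEMM below), and the scheduling call assumes the usual NEScheduler::get().schedule() entry point and an X-dimension split rather than anything introduced by this patch:

    #include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h"
    #include "arm_compute/core/Window.h"
    #include "arm_compute/runtime/NEON/NEScheduler.h"

    // a, b, c and workspace are assumed to be valid, already-allocated tensors.
    void run_assembly_gemm(arm_compute::ITensor *a, arm_compute::ITensor *b,
                           arm_compute::ITensor *c, arm_compute::ITensor *workspace)
    {
        arm_compute::NEGEMMAArch64Kernel gemm;
        // Defaults transform_0 = transform_1 = true: the kernel interleaves A and
        // transposes/interleaves B itself before running the 12x8 micro-kernel.
        gemm.configure(a, b, c, workspace, 1.f, 0.f);
        arm_compute::NEScheduler::get().schedule(&gemm, arm_compute::Window::DimX);
    }
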
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#ifdef __aarch64__ +// Macro to use in assembler to get a preload. Needed because of various +// workarounds needed to get working preload behaviour. +// +// Code using these macros needs to clobber x20 and x21 as they might be +// used by the workaround. + +#define ASM_PREFETCH(address) "PRFM PLDL1KEEP, " address "\n" +#define ASM_PREFETCHL2(address) "PRFM PLDL2KEEP, " address "\n" +#define ASM_PREFETCHW(address) "PRFM PSTL1KEEP, " address "\n" +#define ASM_PREFETCHWL2(address) "PRFM PSTL2KEEP, " address "\n" + +#else + +#define ASM_PREFETCH(address) "PLD " address "\n" +#define ASM_PREFETCHW(address) "PLDW " address "\n" + +#endif + +/* + * Do some prefetches. + */ +template +static inline void prefetch_6x(const T *pfp) { + __asm __volatile ( + ASM_PREFETCH("[%[pfp]]") + ASM_PREFETCH("[%[pfp], #64]") + ASM_PREFETCH("[%[pfp], #128]") + ASM_PREFETCH("[%[pfp], #192]") + ASM_PREFETCH("[%[pfp], #256]") + ASM_PREFETCH("[%[pfp], #320]") + : + : [pfp] "r" (pfp) + : "memory" + ); +} + +template +static inline void prefetch_5x(const T *pfp) { + __asm __volatile ( + ASM_PREFETCH("[%[pfp]]") + ASM_PREFETCH("[%[pfp], #64]") + ASM_PREFETCH("[%[pfp], #128]") + ASM_PREFETCH("[%[pfp], #192]") + ASM_PREFETCH("[%[pfp], #256]") + : + : [pfp] "r" (pfp) + : "memory" + ); +} + +template +static inline void prefetch_4x(const T *pfp) { + __asm __volatile ( + ASM_PREFETCH("[%[pfp]]") + ASM_PREFETCH("[%[pfp], #64]") + ASM_PREFETCH("[%[pfp], #128]") + ASM_PREFETCH("[%[pfp], #192]") + : + : [pfp] "r" (pfp) + : "memory" + ); +} + +template +static inline void prefetch_3x(const T *pfp) { + __asm __volatile ( + ASM_PREFETCH("[%[pfp]]") + ASM_PREFETCH("[%[pfp], #64]") + ASM_PREFETCH("[%[pfp], #128]") + : + : [pfp] "r" (pfp) + : "memory" + ); +} + +template +static inline void prefetch_2x(const T *pfp) { + __asm __volatile ( + ASM_PREFETCH("[%[pfp]]") + ASM_PREFETCH("[%[pfp], #64]") + : + : [pfp] "r" (pfp) + : "memory" + ); +} + +template +static inline void prefetch_1x(const T *pfp) { + __asm __volatile ( + ASM_PREFETCH("[%[pfp]]") + : + : [pfp] "r" (pfp) + : "memory" + ); +} diff --git a/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp b/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp new file mode 100644 index 0000000000..00974436ff --- /dev/null +++ b/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2017 ARM Limited. 
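
A note on how these helpers are used further down: each ASM_PREFETCH covers one 64-byte cache line, so prefetch_6x() touches 384 bytes. For the float 12x8 path that is exactly one 96-element result block, which is why the merge routine later in this patch issues the pair of calls wrapped below (illustrative fragment only; it assumes asmlib.hpp is included):

    // Illustrative only: warm the current and the next 12x8 float result block,
    // exactly as the MergeResults<12, 8> specialisation below does.
    // One block is 96 floats = 384 bytes = 6 cache lines.
    static inline void prefetch_two_merge_blocks(const float *inptr)
    {
        prefetch_6x(inptr);        // current block
        prefetch_6x(inptr + 96);   // the block after it
    }
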
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +// Abstract class for a GEMM function +template +class GemmCommon { +public: + virtual size_t get_working_size() const = 0; + virtual void execute(const To *, const int, const To *, const int, Tr *, const int, const Tr, const Tr, void *working_space = NULL) const = 0; + virtual ~GemmCommon() { } +}; diff --git a/arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp b/arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp new file mode 100644 index 0000000000..a186d88355 --- /dev/null +++ b/arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#include + +#include "gemm_common.hpp" +#include "profiler.hpp" +#include "transform.hpp" +#include "mergeresults.hpp" + +// Some macros used to decide how much working space to allocate. +// Round allocations up to the next cache line. +#define ALLOC_ROUND 64 +#define ROUND_UP(x) ((((x) + ALLOC_ROUND-1) / ALLOC_ROUND) * ALLOC_ROUND) + +// Implementation of the GemmCommon abstract class. +// +// This implementation interleaves the source matrices in blocks - good for +// larger matrices. 
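
Before the interleaved implementation itself, here is a plain scalar statement of the contract that execute() has to reproduce: row-major operands with explicit leading dimensions and C = alpha * A * B + beta * C. This is an editor's illustration for readers, not code from the library (M, N and K are passed explicitly here, whereas GemmInterleaved takes them in its constructor):

    // Scalar reference for C = alpha * A * B + beta * C with row-major storage
    // and explicit leading dimensions, matching the execute() signature below.
    static void reference_gemm(const float *A, int lda, const float *B, int ldb,
                               float *C, int ldc, unsigned int M, unsigned int N,
                               unsigned int K, float alpha, float beta)
    {
        for (unsigned int m = 0; m < M; m++) {
            for (unsigned int n = 0; n < N; n++) {
                float acc = 0.f;
                for (unsigned int k = 0; k < K; k++) {
                    acc += A[m * lda + k] * B[k * ldb + n];
                }
                C[m * ldc + n] = alpha * acc + beta * C[m * ldc + n];
            }
        }
    }
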
+template +class GemmInterleaved : public GemmCommon { + typedef typename strategy::operand_type Toi; + typedef typename strategy::result_type Tri; + + const unsigned int M; + const unsigned int N; + const unsigned int K; + + const bool trA; + const bool trB; + + const strategy strat; + + unsigned int k_block = 0; + unsigned int x_block = 0; + unsigned int Mround = 0; + + size_t get_a_working_size() const { + return ROUND_UP(sizeof(Toi) * k_block * Mround); + } + + size_t get_b_working_size() const { + return ROUND_UP(sizeof(Toi) * x_block * k_block); + } + + size_t get_c_working_size() const { + return ROUND_UP(sizeof(Tri) * x_block * strat.out_height); + } + +public: + size_t get_working_size() const override { + return get_a_working_size() + get_b_working_size() + get_c_working_size(); + } + + GemmInterleaved(const CPUInfo *ci, const unsigned int M, const unsigned int N, const unsigned int K, const bool trA, const bool trB) : M(M), N(N), K(K), trA(trA), trB(trB), strat(ci) { + const unsigned int L1_size = ci->L1_size; + const unsigned int L2_size = ci->L2_size; + + // Work out blocking parameters + // k_block: Each iteration will consume (out_width + out_height) + // operands - so how many iterations will fill the L1? + k_block = L1_size / (sizeof(Toi) * (strat.out_width + strat.out_height)); + + // Needs to be a multiple of the K unroll level. + k_block /= strat.k_unroll; + k_block *= strat.k_unroll; + + // Now tune to presented problem size; this is how many blocks we need. + int num_k_blocks = (K + (k_block - 1)) / k_block; + + // So divide the space equally into that many blocks. + k_block = (K + num_k_blocks - 1) / num_k_blocks; + + // And round UP to the K unroll level required. + k_block = (k_block + strat.k_unroll - 1) / strat.k_unroll; + k_block *= strat.k_unroll; + + // x_block: Work out how many rows (of length k_block) will fit in the L2 + x_block = L2_size / (sizeof(Toi) * k_block); + + // Needs to be a multiple of the kernel output width. + x_block /= strat.out_width; + x_block *= strat.out_width; + + // And tune to the presented problem size. + int num_x_blocks = (N + (x_block - 1)) / x_block; + x_block = (N + num_x_blocks - 1) / num_x_blocks; + + x_block = (x_block + strat.out_width - 1) / strat.out_width; + x_block *= strat.out_width; + + // Work out the rounded size of M - needed for some buffers. + Mround = (M + (strat.out_height - 1)) / strat.out_height; + Mround *= strat.out_height; + } + + // Actually execute the GEMM. + void execute(const To *A, const int lda, const To *B, const int ldb, Tr *C, const int ldc, const Tr alpha, const Tr beta, void *working_space) const override { + profiler prof; + + int8_t *working_space_bytes = reinterpret_cast(working_space); + intptr_t working_space_int = reinterpret_cast(working_space_bytes); + size_t diff = 0; + + if (working_space_int & 0xF) { + diff = 0x10 - (working_space_int & 0xF); + } + + // TODO: Multithreaded implementations could share the burden of transforming these blocks. + Toi * const a_panel = reinterpret_cast(working_space_bytes + diff); + Toi * const b_panel = reinterpret_cast(working_space_bytes + get_a_working_size() + diff); + Tri * const c_panel = reinterpret_cast(working_space_bytes + get_a_working_size() + get_b_working_size() + diff); + + for (unsigned int k0=0; k0 K) kmax = K; + + // Figure out how many "K" the kernel will actually process. 
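    // Editor's worked example (numbers are assumed, not from the patch): with the
    // sgemm_12x8 strategy (out_width = 12, out_height = 8, k_unroll = 1, Toi = float)
    // on a core reporting L1_size = 32768 and L2_size = 524288, and M = 128,
    // N = 1000, K = 1000, the constructor above produces:
    //   k_block = 32768 / (4 * (12 + 8)) = 409 -> K splits into 3 blocks -> k_block = 334
    //   x_block = 524288 / (4 * 334) = 392 -> rounded down to 384 -> N splits into 3 -> x_block = 336
    //   Mround  = 128 (already a multiple of out_height)
    // so the three K blocks here get kern_k = 334, 334 and 332 (with k_unroll = 1 the
    // round-up below is a no-op).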
+ int kern_k = ((kmax - k0) + (strat.k_unroll - 1)) / strat.k_unroll; + kern_k *= strat.k_unroll; + + prof(PROFILE_PREPA, [&](void) { + if (trA ^ strategy::A_transpose) { + Transform(a_panel, A, lda, 0, M, k0, kmax); + } else { + Transform(a_panel, A, lda, 0, M, k0, kmax); + } + }); + + for (unsigned int x0=0; x0 N) xmax = N; + + int bblocks = (xmax - x0 + strat.out_width - 1) / strat.out_width; + + prof(PROFILE_PREPB, [&](void) { + if (trB ^ strategy::B_transpose) { + Transform(b_panel, B, ldb, x0, xmax, k0, kmax); + } else { + Transform(b_panel, B, ldb, x0, xmax, k0, kmax); + } + }); + + for (unsigned int y=0; y M) ymax = M; + + prof(PROFILE_KERNEL, [&](void) { strat.kernel(a_panel + (y * kern_k), b_panel, c_panel, 1, bblocks, kern_k); }); + prof(PROFILE_MERGE, [&](void) { MergeResults(C, c_panel, ldc, y, ymax, x0, xmax, alpha, (k0==0 ? beta : static_cast(1))); }); + } + } + } + } +}; diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp new file mode 100644 index 0000000000..e229e215ef --- /dev/null +++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#ifdef __aarch64__ + +// Actual kernel implementations +#include "a64_sgemm_12x8/generic.hpp" +#include "a64_sgemm_12x8/a53.hpp" + +// 12x8 SGEMM "strategy" class. +// +// This describes the characteristics of a family of kernels, in terms of +// the required interleave properties and the output block size. +// +// All kernels in the family must share these characteristics. The actual +// kernel to be used can be chosen at runtime, based on the CPU_type +// structure. 
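
To show how the pieces compose, here is a hedged sketch of driving GemmInterleaved with the strategy defined just below. The template argument list <strategy, To, Tr> is inferred from the class bodies (the flattened diff hides it), and the extra 16 bytes of workspace cover the internal alignment fix-up in execute(); treat the whole fragment as an outline rather than the library's public API:

    #include <cstdint>
    #include <vector>

    void run_interleaved_sgemm(const CPUInfo *ci,
                               unsigned int M, unsigned int N, unsigned int K,
                               const float *A, int lda, const float *B, int ldb,
                               float *C, int ldc, float alpha, float beta)
    {
        // Neither input is pre-transposed here, so trA = trB = false.
        GemmInterleaved<sgemm_12x8, float, float> gemm(ci, M, N, K, false, false);

        // execute() shifts its panels to a 16-byte boundary inside the buffer,
        // so leave a little headroom beyond get_working_size().
        std::vector<int8_t> workspace(gemm.get_working_size() + 16);

        gemm.execute(A, lda, B, ldb, C, ldc, alpha, beta, workspace.data());
    }
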
+class sgemm_12x8 { +public: + typedef float operand_type; + typedef float result_type; + + typedef void (*kern_type)(const float *, const float *, float *, int, int, int); + + /* Describes the data layout for A input */ + static const int A_interleave = 8; + static const int A_block = 1; + static const int A_transpose = 0; + + /* Same for B input */ + static const int B_interleave = 12; + static const int B_block = 1; + static const int B_transpose = 1; + + /* Kernel blocking parameters */ + static const int out_width = 12; + static const int out_height = 8; + static const int k_unroll = 1; + + kern_type kernel{nullptr}; + + sgemm_12x8(const CPUInfo *ci) { + kernel = a64_sgemm_asimd_12x8; + if (ci->CPU == CPUTarget::A53) { + kernel = a64_sgemm_asimd_12x8_a53; + } + } +}; + +#endif // __aarch64__ diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a53.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a53.hpp new file mode 100644 index 0000000000..e58ce66825 --- /dev/null +++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a53.hpp @@ -0,0 +1,367 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +inline void a64_sgemm_asimd_12x8_a53(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { + const float *a_ptr = Apanel; + float *c_ptr = Cpanel; + + for (int yb=0; yb + +// Kernel implementation. +// +// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order. +// Assume that "Bpanel" points to a chunk of B blocks (each size 12xK) in read-order. +// Assume that "Cpanel" points to a chunk of C output blocks (each size +// 12x8), the chunks being arranged in a row major fashion. +// +// Note that the intent of this is that either ablocks or bblocks will be 1 +// - this construction allows the output loop to proceed in either order. + +inline void a64_sgemm_asimd_12x8_jumps(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K, long int row_jump=0, long int block_jump=0) { + const float *a_ptr = Apanel; + float *c_ptr = Cpanel; + + for (int yb=0; yb +void MergeResults(Tout *out, const Tin *in, int ldc, int y0, int ymax, int x0, int xmax, const Tout alpha, const Tout beta) { + int full_y_blocks = (ymax - y0) / height; + int y_remainder = (ymax - y0) % height; + int y_blocks = full_y_blocks + (y_remainder ? 
1 : 0); + + int full_x_blocks = (xmax - x0) / width; + int x_remainder = (xmax - x0) % width; + int x_blocks = full_x_blocks + (x_remainder ? 1 : 0); + + for (int y_block = 0; y_block < y_blocks; y_block++) { + int ybase = y0 + (y_block * height); + + int fill_rows = (y_block < full_y_blocks) ? height : y_remainder; + + for (int x_block = 0; x_block < x_blocks; x_block++) { + int xbase = x0 + (x_block * width); + + int fill_cols = (x_block < full_x_blocks) ? width : x_remainder; + + for (int row=0; row < fill_rows; row++) { + for (int col=0; col < fill_cols; col++) { + Tout &p = out[(ybase + row) * ldc + xbase + col]; + + p = (p * alpha) + (beta * in[row * width + col]); + } + } + + in += (width * height); + } + } +} + +#include "merges/list.hpp" diff --git a/arm_compute/core/NEON/kernels/assembly/merges/a64_merge_float_12x8.hpp b/arm_compute/core/NEON/kernels/assembly/merges/a64_merge_float_12x8.hpp new file mode 100644 index 0000000000..f2c5fd86b9 --- /dev/null +++ b/arm_compute/core/NEON/kernels/assembly/merges/a64_merge_float_12x8.hpp @@ -0,0 +1,236 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#ifdef __aarch64__ + +#include "../asmlib.hpp" + +template<> +inline void MergeResults<12, 8>(float *out, const float *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const float alpha, const float beta) { + const float *inptr = in; + prefetch_6x(inptr); + prefetch_6x(inptr + 96); + + float32x4_t av = vdupq_n_f32(alpha); + float32x4_t bv = vdupq_n_f32(beta); + + for (int y=y0; y= ymax) { + switch ((y + 7) - ymax) { + case 6: + outptr1 = dummyres; + case 5: + outptr2 = dummyres; + case 4: + outptr3 = dummyres; + case 3: + outptr4 = dummyres; + case 2: + outptr5 = dummyres; + case 1: + outptr6 = dummyres; + case 0: + outptr7 = dummyres; + default: + break; + } + } + + /* For ragged X, manually copy over the valid results. 
*/ + if ((i+11) >= xmax) { + for (int xi=0; xi<12; xi++) { + if ((i+xi) < xmax) { + *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta); + outptr0++; + *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta); + outptr1++; + *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta); + outptr2++; + *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta); + outptr3++; + *outptr4 = (alpha * inptr[xi + 48]) + (*outptr4 * beta); + outptr4++; + *outptr5 = (alpha * inptr[xi + 60]) + (*outptr5 * beta); + outptr5++; + *outptr6 = (alpha * inptr[xi + 72]) + (*outptr6 * beta); + outptr6++; + *outptr7 = (alpha * inptr[xi + 84]) + (*outptr7 * beta); + outptr7++; + } + } + inptr += 96; + } else { + /* Optimized routine to copy an entire block */ + __asm __volatile ( + // Rows 0-1 + "LDP q16, q17, [%[outptr0]]\n" + "FMUL v16.4s, v16.4s, %[bv].4s\n" + "LDR q18, [%[outptr0], #32]\n" + "FMUL v17.4s, v17.4s, %[bv].4s\n" + "LDP q19, q20, [%[outptr1]]\n" + "FMUL v18.4s, v18.4s, %[bv].4s\n" + "LDR q21, [%[outptr1], #32]\n" + ASM_PREFETCH("[%[inptr], #768]") + "FMUL v19.4s, v19.4s, %[bv].4s\n" + "LDP q0, q1, [%[inptr]]\n" + "FMUL v20.4s, v20.4s, %[bv].4s\n" + "LDP q2, q3, [%[inptr], #32]\n" + "FMUL v21.4s, v21.4s, %[bv].4s\n" + "LDP q4, q5, [%[inptr], #64]\n" + "FMLA v16.4s, v0.4s, %[av].4s\n" + ASM_PREFETCH("[%[inptr], #832]") + "FMLA v17.4s, v1.4s, %[av].4s\n" + "STP q16, q17, [%[outptr0]], #32\n" + "FMLA v18.4s, v2.4s, %[av].4s\n" + "STR q18, [%[outptr0]], #16\n" + "FMLA v19.4s, v3.4s, %[av].4s\n" + ASM_PREFETCH("[%[inptr], #896]") + "FMLA v20.4s, v4.4s, %[av].4s\n" + "STP q19, q20, [%[outptr1]], #32\n" + "FMLA v21.4s, v5.4s, %[av].4s\n" + "STR q21, [%[outptr1]], #16\n" + + // Rows 2-3 + "LDP q16, q17, [%[outptr2]]\n" + "FMUL v16.4s, v16.4s, %[bv].4s\n" + "LDR q18, [%[outptr2], #32]\n" + "FMUL v17.4s, v17.4s, %[bv].4s\n" + "LDP q19, q20, [%[outptr3]]\n" + "FMUL v18.4s, v18.4s, %[bv].4s\n" + "LDR q21, [%[outptr3], #32]\n" + ASM_PREFETCH("[%[inptr], #960]") + "FMUL v19.4s, v19.4s, %[bv].4s\n" + "LDP q0, q1, [%[inptr], #96]\n" + "FMUL v20.4s, v20.4s, %[bv].4s\n" + "LDP q2, q3, [%[inptr], #128]\n" + "FMUL v21.4s, v21.4s, %[bv].4s\n" + "LDP q4, q5, [%[inptr], #160]\n" + "FMLA v16.4s, v0.4s, %[av].4s\n" + ASM_PREFETCH("[%[inptr], #1024]") + "FMLA v17.4s, v1.4s, %[av].4s\n" + "STP q16, q17, [%[outptr2]], #32\n" + "FMLA v18.4s, v2.4s, %[av].4s\n" + "STR q18, [%[outptr2]], #16\n" + "FMLA v19.4s, v3.4s, %[av].4s\n" + ASM_PREFETCH("[%[inptr], #1088]") + "FMLA v20.4s, v4.4s, %[av].4s\n" + "STP q19, q20, [%[outptr3]], #32\n" + "FMLA v21.4s, v5.4s, %[av].4s\n" + "STR q21, [%[outptr3]], #16\n" + + // Rows 4-5 + ASM_PREFETCH("[%[outptr0], #80]") + "LDP q16, q17, [%[outptr4]]\n" + "FMUL v16.4s, v16.4s, %[bv].4s\n" + "LDR q18, [%[outptr4], #32]\n" + "FMUL v17.4s, v17.4s, %[bv].4s\n" + "LDP q19, q20, [%[outptr5]]\n" + "FMUL v18.4s, v18.4s, %[bv].4s\n" + "LDR q21, [%[outptr5], #32]\n" + ASM_PREFETCH("[%[outptr1], #80]") + "FMUL v19.4s, v19.4s, %[bv].4s\n" + "LDP q0, q1, [%[inptr], #192]\n" + "FMUL v20.4s, v20.4s, %[bv].4s\n" + "LDP q2, q3, [%[inptr], #224]\n" + "FMUL v21.4s, v21.4s, %[bv].4s\n" + "LDP q4, q5, [%[inptr], #256]\n" + "FMLA v16.4s, v0.4s, %[av].4s\n" + ASM_PREFETCH("[%[outptr2], #80]") + "FMLA v17.4s, v1.4s, %[av].4s\n" + "STP q16, q17, [%[outptr4]], #32\n" + "FMLA v18.4s, v2.4s, %[av].4s\n" + "STR q18, [%[outptr4]], #16\n" + "FMLA v19.4s, v3.4s, %[av].4s\n" + ASM_PREFETCH("[%[outptr3], #80]") + "FMLA v20.4s, v4.4s, %[av].4s\n" + "STP q19, q20, [%[outptr5]], #32\n" + "FMLA v21.4s, v5.4s, %[av].4s\n" + "STR q21, 
[%[outptr5]], #16\n" + + // Rows 6-7 + ASM_PREFETCH("[%[outptr4], #80]") + "LDP q16, q17, [%[outptr6]]\n" + "FMUL v16.4s, v16.4s, %[bv].4s\n" + "LDR q18, [%[outptr6], #32]\n" + "FMUL v17.4s, v17.4s, %[bv].4s\n" + "LDP q19, q20, [%[outptr7]]\n" + "FMUL v18.4s, v18.4s, %[bv].4s\n" + "LDR q21, [%[outptr7], #32]\n" + ASM_PREFETCH("[%[outptr5], #80]") + "FMUL v19.4s, v19.4s, %[bv].4s\n" + "LDP q0, q1, [%[inptr], #288]\n" + "FMUL v20.4s, v20.4s, %[bv].4s\n" + "LDP q2, q3, [%[inptr], #320]\n" + "FMUL v21.4s, v21.4s, %[bv].4s\n" + "LDP q4, q5, [%[inptr], #352]\n" + "FMLA v16.4s, v0.4s, %[av].4s\n" + ASM_PREFETCH("[%[outptr6], #128]") + "FMLA v17.4s, v1.4s, %[av].4s\n" + "STP q16, q17, [%[outptr6]], #32\n" + "FMLA v18.4s, v2.4s, %[av].4s\n" + "STR q18, [%[outptr6]], #16\n" + "FMLA v19.4s, v3.4s, %[av].4s\n" + ASM_PREFETCH("[%[outptr7], #128]") + "FMLA v20.4s, v4.4s, %[av].4s\n" + "STP q19, q20, [%[outptr7]], #32\n" + "FMLA v21.4s, v5.4s, %[av].4s\n" + "STR q21, [%[outptr7]], #16\n" + "ADD %[inptr], %[inptr], #384\n" + : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), + [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7), + [inptr] "+r" (inptr) + : [av] "w" (av), [bv] "w" (bv) + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q16", "q17", "q18", "q19", "q20", "q21" + ); + } + } + } +} + +#endif // __aarch64__ diff --git a/arm_compute/core/NEON/kernels/assembly/merges/list.hpp b/arm_compute/core/NEON/kernels/assembly/merges/list.hpp new file mode 100644 index 0000000000..4f23333ef1 --- /dev/null +++ b/arm_compute/core/NEON/kernels/assembly/merges/list.hpp @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "a64_merge_float_12x8.hpp" diff --git a/arm_compute/core/NEON/kernels/assembly/profiler.hpp b/arm_compute/core/NEON/kernels/assembly/profiler.hpp new file mode 100644 index 0000000000..d2f8ba923a --- /dev/null +++ b/arm_compute/core/NEON/kernels/assembly/profiler.hpp @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2017 ARM Limited. 
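
For readers skipping the assembly above: every full tile handled by the MergeResults<12, 8> specialisation performs the same update as this scalar routine (an editor's restatement matching the ragged-edge path in the same function; y and i are the tile's first row and column):

    // out points at C, laid out row-major with leading dimension ldout; in holds
    // one 12x8 block of kernel results in row-major order.
    static void merge_tile_12x8(float *out, const float *in, int ldout, int y, int i,
                                float alpha, float beta)
    {
        for (int row = 0; row < 8; row++) {
            for (int col = 0; col < 12; col++) {
                float &c = out[(y + row) * ldout + i + col];
                c = alpha * in[row * 12 + col] + beta * c;
            }
        }
    }
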
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#ifdef CYCLE_PROFILING + +#include "../perf.h" + +class profiler { +private: + static const int maxevents = 10000; + unsigned long times[maxevents]; + int events[maxevents]; + int currentevent; + int countfd; + +public: + profiler() { + currentevent=0; + countfd=open_cycle_counter(); + } + + ~profiler() { + close(countfd); + int tots[5]; + unsigned long counts[5]; + const char * descs[] = { "Prepare A", "Prepare B", "Kernel", "Merge" }; + + for (int i=1; i<5; i++) { + tots[i] = 0; + counts[i] = 0; + } + + printf("Profiled events:\n"); + for (int i=0; i + void operator() (int i, T func) { + if (currentevent==maxevents) { + func(); + } else { + start_counter(countfd); + func(); + long long cycs = stop_counter(countfd); + events[currentevent] = i; + times[currentevent++] = cycs; + } + } +}; + +#else + +class profiler { +public: + template + void operator() (int i, T func) { + func(); + } +}; + +#endif + +#define PROFILE_PREPA 1 +#define PROFILE_PREPB 2 +#define PROFILE_KERNEL 3 +#define PROFILE_MERGE 4 diff --git a/arm_compute/core/NEON/kernels/assembly/transform.hpp b/arm_compute/core/NEON/kernels/assembly/transform.hpp new file mode 100644 index 0000000000..717506f54c --- /dev/null +++ b/arm_compute/core/NEON/kernels/assembly/transform.hpp @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +/* + * Generic transform. + * + * Assuming the untransposed case, this works by first reading + * consecutive values from the first input row. This same number of values + * are then read from the next rows. Now return to the first + * input row and repeat. + * + * Need to cope with the work requested in either dimension not actually + * being a multiple of the block sizes. + */ +template +struct TransformImpl { + template + static void Transform(TOut* out, const TIn* const in, const int stride, + const int y0, const int ymax, const int x0, const int xmax) { + const int n_whole_y_blocks = (ymax - y0) / IntBy; + const int y_remainders = (ymax - y0) % IntBy; + const int n_y_blocks = n_whole_y_blocks + (y_remainders ? 1 : 0); + + const int n_whole_x_blocks = (xmax - x0) / BlockBy; + const int x_remainders = (xmax - x0) % BlockBy; + const int n_x_blocks = n_whole_x_blocks + (x_remainders ? 1 : 0); + + // "Y" loop: advance down the rows of the source IntBy rows at a time. + // Set up fill_rows to show the number rows to copy from, and blank_rows + // for the number of blank rows to add. + for (int y_block=0 ; y_block < n_y_blocks; y_block++) { + int fill_rows = (y_block < n_whole_y_blocks) ? IntBy : y_remainders; + int blank_rows = IntBy - fill_rows; + + int y_base = y0 + (y_block * IntBy); + + // So now advance along this block of rows, BlockBy columns at a time. + for (int x_block=0 ; x_block < n_x_blocks; x_block++) { + int fill_cols = (x_block < n_whole_x_blocks) ? BlockBy : x_remainders; + int blank_cols = BlockBy - fill_cols; + + int x_base = x0 + (x_block * BlockBy); + + for (int row = 0; row < fill_rows; row++) { + for (int col = 0; col < fill_cols; col++) { + // In-range copy. If it's transposed, we reverse the sense of rows and columns here. + if (Transposed) { + *out++ = static_cast(in[(x_base + col) * stride + y_base + row]); + } else { + *out++ = static_cast(in[(y_base + row) * stride + x_base + col]); + } + } + // "col" tail - row is in range but column is out of range. + for (int col=0; col < blank_cols; col++) { + *out++ = static_cast(0); + } + } + // "row" tail - row is out of range so fill with zeros always. + for (int row = 0; row < blank_rows; row++) { + for (int col=0; col < (fill_cols + blank_cols); col++) { + *out++ = static_cast(0); + } + } + } + } + } + + template + static inline void Transform(T* out, const T* const in, const int stride, + const int k0, const int kmax, const int x0, const int xmax) { + Transform(out, in, stride, k0, kmax, x0, xmax); + } +}; + +/*****************************************************************************/ +template +void Transform( + TOut* out, const TIn* const in, const int stride, + const int k0, const int kmax, const int x0, const int xmax +) { + // Redirect to a specialised implementation predicated on argument size. 
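    // Editor's illustration (not part of the patch): for the A-side parameters used by
    // sgemm_12x8 - IntBy = 8, BlockBy = 1, not transposed - the generic routine above
    // emits an 8-row slice of the source column by column:
    //
    //   A[r0][k], A[r0+1][k], ..., A[r0+7][k], A[r0][k+1], A[r0+1][k+1], ...
    //
    // and when fewer than eight rows remain at the bottom, the missing rows are written
    // as zeros so every output panel keeps a fixed 8 x kern_k shape. The AArch64
    // specialisation in a64_interleave_8way_32bit.hpp produces exactly this layout with
    // ZIP instructions.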
+ TransformImpl::Transform( + out, in, stride, k0, kmax, x0, xmax + ); +} +/*****************************************************************************/ + +#include "transforms/list.hpp" diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_32bit.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_32bit.hpp new file mode 100644 index 0000000000..6317424598 --- /dev/null +++ b/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_32bit.hpp @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#ifdef __aarch64__ + +#include "../asmlib.hpp" + +#include + +template<> +template +void TransformImpl<8, 1, false, 4, 4>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) { + uint32_t *outptr = (uint32_t *)out; + const uint32_t *inptr = (uint32_t *)in; + + uint32_t zerobuff[8]; + + for (int y=y0; y7;x-=8) { + /* Cope with ragged cases by copying from a buffer of zeroes instead */ + if ((y + 7) >= ymax) { + switch ((y + 7) - ymax) { + /* Everything falls through in here */ + case 6: + inptr1 = zerobuff; + case 5: + inptr2 = zerobuff; + case 4: + inptr3 = zerobuff; + case 3: + inptr4 = zerobuff; + case 2: + inptr5 = zerobuff; + case 1: + inptr6 = zerobuff; + case 0: + inptr7 = zerobuff; + default: + break; + } + } + + __asm __volatile ( + // Load up 8 elements (2 vectors) from each of 8 sources. 
+ "LDP q0, q1, [%[inptr0]], #32\n" // q0=A0A1A2A3 + "LDP q2, q3, [%[inptr1]], #32\n" // q2=B0B1B2B3 + "LDP q4, q5, [%[inptr2]], #32\n" // q4=C0C1C2C3 + "ZIP1 v16.4s, v0.4s, v4.4s\n" // q16=A0C0A1C1 + ASM_PREFETCH("[%[inptr0], #128]") + "LDP q6, q7, [%[inptr3]], #32\n" // q6=D0D1D2D3 + "ZIP1 v17.4s, v2.4s, v6.4s\n" // q17=B0D0B1D1 + "LDP q8, q9, [%[inptr4]], #32\n" + "LDP q10, q11, [%[inptr5]], #32\n" + ASM_PREFETCH("[%[inptr1], #128]") + "LDP q12, q13, [%[inptr6]], #32\n" + "ZIP1 v18.4s, v8.4s, v12.4s\n" + "LDP q14, q15, [%[inptr7]], #32\n" + "ZIP1 v19.4s, v10.4s, v14.4s\n" + + ASM_PREFETCH("[%[inptr2], #128]") + "ZIP1 v20.4s, v16.4s, v17.4s\n" // q20=A0B0C0D0 + "ZIP1 v21.4s, v18.4s, v19.4s\n" + "ZIP2 v22.4s, v16.4s, v17.4s\n" + "ZIP2 v23.4s, v18.4s, v19.4s\n" + ASM_PREFETCH("[%[inptr3], #128]") + + "ZIP2 v16.4s, v0.4s, v4.4s\n" + "ZIP2 v17.4s, v2.4s, v6.4s\n" + "STP q20, q21, [%[outptr]], #32\n" // Write back the first element of each source + + "ZIP2 v18.4s, v8.4s, v12.4s\n" + ASM_PREFETCH("[%[inptr4], #128]") + "ZIP2 v19.4s, v10.4s, v14.4s\n" + "STP q22, q23, [%[outptr]], #32\n" // Write back the second element of each source + + "ZIP1 v20.4s, v16.4s, v17.4s\n" + "ZIP1 v21.4s, v18.4s, v19.4s\n" + ASM_PREFETCH("[%[inptr5], #128]") + "ZIP2 v22.4s, v16.4s, v17.4s\n" + "ZIP2 v23.4s, v18.4s, v19.4s\n" + + "ZIP1 v16.4s, v1.4s, v5.4s\n" + "ZIP1 v17.4s, v3.4s, v7.4s\n" + ASM_PREFETCH("[%[inptr6], #128]") + "STP q20, q21, [%[outptr]], #32\n" // Third element + + "ZIP1 v18.4s, v9.4s, v13.4s\n" + "ZIP1 v19.4s, v11.4s, v15.4s\n" + "STP q22, q23, [%[outptr]], #32\n" // Fourth element + ASM_PREFETCH("[%[inptr7], #128]") + + "ZIP1 v20.4s, v16.4s, v17.4s\n" + "ZIP1 v21.4s, v18.4s, v19.4s\n" + "ZIP2 v22.4s, v16.4s, v17.4s\n" + "ZIP2 v23.4s, v18.4s, v19.4s\n" + + "ZIP2 v16.4s, v1.4s, v5.4s\n" + "ZIP2 v17.4s, v3.4s, v7.4s\n" + "STP q20, q21, [%[outptr]], #32\n" // Fifth element + + "ZIP2 v18.4s, v9.4s, v13.4s\n" + "ZIP2 v19.4s, v11.4s, v15.4s\n" + "STP q22, q23, [%[outptr]], #32\n" // Sixth element + + "ZIP1 v20.4s, v16.4s, v17.4s\n" + "ZIP1 v21.4s, v18.4s, v19.4s\n" + "STP q20, q21, [%[outptr]], #32\n" // Seventh element + + "ZIP2 v22.4s, v16.4s, v17.4s\n" + "ZIP2 v23.4s, v18.4s, v19.4s\n" + "STP q22, q23, [%[outptr]], #32\n" // Eighth element + : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), + [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr) + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", + "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" + ); + } + + for (;x>0;x--) { + *outptr++ = *inptr0++; + *outptr++ = *inptr1++; + *outptr++ = *inptr2++; + *outptr++ = *inptr3++; + *outptr++ = *inptr4++; + *outptr++ = *inptr5++; + *outptr++ = *inptr6++; + *outptr++ = *inptr7++; + } + } +} + +#endif // __aarch64__ diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/list.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/list.hpp new file mode 100644 index 0000000000..3cf6b41ffa --- /dev/null +++ b/arm_compute/core/NEON/kernels/assembly/transforms/list.hpp @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +//#include "a32_interleave_6way_32bit.hpp" +//#include "a32_transpose_interleave_8way_32bit.hpp" +//#include "a64_interleave_8way_16bit.hpp" +#include "a64_interleave_8way_32bit.hpp" +//#include "a64_interleave_8way_half_to_float.hpp" +//#include "a64_transpose_interleave_12way_16bit.hpp" +//#include "a64_transpose_interleave_12way_half_to_float.hpp" +//#include "a64_transpose_interleave_24way_16bit.hpp" +#include "transpose_interleave_common.hpp" diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/transpose_interleave_common.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/transpose_interleave_common.hpp new file mode 100644 index 0000000000..882da9c831 --- /dev/null +++ b/arm_compute/core/NEON/kernels/assembly/transforms/transpose_interleave_common.hpp @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#pragma once + +template +struct TransposeInterleaveCommon { + // Override the moveblock_1xY methods to improve performance + static inline void moveblock_1x1(const TIn *&in0, TOut *out) { + for (unsigned int i = 0; i < IntBy; i++) { + *out++ = static_cast(*in0++); + } + } + + static inline void moveblock_1x2(const TIn *&in0, const TIn *&in1, TOut *out) { + for (unsigned int i = 0; i < IntBy; i++) { + *out++ = static_cast(*in0++); + } + for (unsigned int i = 0; i < IntBy; i++) { + *out++ = static_cast(*in1++); + } + } + + static inline void moveblock_1x4(const TIn *&in0, const TIn *&in1, const TIn *&in2, const TIn *&in3, TOut *out) { + for (unsigned int i = 0; i < IntBy; i++) { + *out++ = static_cast(*in0++); + } + for (unsigned int i = 0; i < IntBy; i++) { + *out++ = static_cast(*in1++); + } + for (unsigned int i = 0; i < IntBy; i++) { + *out++ = static_cast(*in2++); + } + for (unsigned int i = 0; i < IntBy; i++) { + *out++ = static_cast(*in3++); + } + } + + static inline void Transform(TOut *out, const TIn *in, const int stride, const int x0, const int xmax, const int k0, const int kmax) { + const auto ldin = stride; + + TOut *outarray = out; + const TIn *inarray = in; + TOut *outptr_base = outarray; + const TIn *inptr_base = inarray + x0 + (k0 * ldin); + int ldout = (kmax - k0) * IntBy; + + int k=(kmax-k0); + for ( ; k>3; k-=4) { + TOut *outptr = outptr_base; + const TIn *inptr = inptr_base; + const TIn *inptr1 = inptr + ldin; + const TIn *inptr2 = inptr1 + ldin; + const TIn *inptr3 = inptr2 + ldin; + + prefetch_3x(inptr); + prefetch_3x(inptr1); + prefetch_3x(inptr2); + prefetch_3x(inptr3); + + outptr_base += IntBy * 4; + inptr_base += ldin * 4; + + for (int x = (xmax-x0) / IntBy; x > 0 ; x--) { + moveblock_1x4(inptr, inptr1, inptr2, inptr3, outptr); + outptr += ldout; + } + } + + if (k) { + TOut *outptr = outptr_base; + const TIn *inptr = inptr_base; + const TIn *inptr1 = inptr + ldin; + const TIn *inptr2 = inptr1 + ldin; + + prefetch_3x(inptr); + prefetch_3x(inptr1); + prefetch_3x(inptr2); + + for (int x = (xmax-x0) / IntBy; x > 0 ; x--) { + switch(k) { + case 3: + moveblock_1x2(inptr, inptr1, outptr); + moveblock_1x1(inptr2, outptr + IntBy * 2); + break; + + case 2: + moveblock_1x2(inptr, inptr1, outptr); + break; + + case 1: + moveblock_1x1(inptr, outptr); + break; + default: + break; + } + + outptr += ldout; + } + } + + // Cope with ragged X cases + const unsigned int overflow = (xmax - x0) % IntBy; + if (overflow) { + const TIn *inptr_base = inarray + (xmax - overflow) + (k0 * ldin); + TOut *outptr = outarray + ((xmax - x0) / IntBy) * ldout; + + for (int k=(kmax-k0); k>0; k--) { + const TIn *inptr = inptr_base; + inptr_base += ldin; + + for (unsigned int x=0; x < IntBy; x++) { + TOut val = (x < overflow) ? static_cast(*inptr++) : static_cast(0); + *outptr++ = val; + } + } + } +} +}; diff --git a/arm_compute/runtime/IScheduler.h b/arm_compute/runtime/IScheduler.h index 6078abd06b..8918843c98 100644 --- a/arm_compute/runtime/IScheduler.h +++ b/arm_compute/runtime/IScheduler.h @@ -35,23 +35,23 @@ class IScheduler { public: /** Default constructor. */ - IScheduler() - : _target(CPUTarget::INTRINSICS) - { - } + IScheduler(); /** Destructor. */ virtual ~IScheduler() = default; + /** Sets the number of threads the scheduler will use to run the kernels. * * @param[in] num_threads If set to 0, then one thread per CPU core available on the system will be used, otherwise the number of threads specified. 
*/ virtual void set_num_threads(unsigned int num_threads) = 0; + /** Returns the number of threads that the SingleThreadScheduler has in his pool. * * @return Number of threads available in SingleThreadScheduler. */ virtual unsigned int num_threads() const = 0; + /** Runs the kernel in the same thread as the caller synchronously. * * @param[in] kernel Kernel to execute. @@ -65,24 +65,14 @@ public: */ void set_target(CPUTarget target); - /** Return the current CPU target. + /** Get CPU info. * - * @return Target CPU. + * @return CPU info. */ - CPUTarget target() const; + CPUInfo cpu_info() const; protected: - CPUTarget _target; + CPUInfo _info{}; }; - -inline void IScheduler::set_target(CPUTarget target) -{ - _target = target; -} - -inline CPUTarget IScheduler::target() const -{ - return _target; -} } #endif /* __ARM_COMPUTE_ISCHEDULER_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h index 8e040b3055..893dfa0f9d 100644 --- a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h @@ -28,6 +28,7 @@ #include "arm_compute/core/NEON/kernels/NECol2ImKernel.h" #include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" @@ -37,6 +38,8 @@ #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/Tensor.h" +#include + namespace arm_compute { class ITensor; @@ -59,6 +62,7 @@ public: * Data types supported: Same as @p weights. */ void configure(const ITensor *weights, const ITensor *biases, ITensor *output, bool transpose1xW); + // Inherited methods overridden: void run() override; @@ -82,6 +86,7 @@ class NEConvolutionLayer : public IFunction public: /** Constructor */ NEConvolutionLayer(std::shared_ptr memory_manager = nullptr); + /** Set the input and output tensors. * * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], @@ -96,23 +101,26 @@ public: * tensor has also been transposed with NEGEMMTranspose1xWKernel. Data type supported: Same as @p input. 
*/ void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo()); + // Inherited methods overridden: void run() override; private: - MemoryGroup _memory_group; - NEIm2ColKernel _input_im2col_kernel; - NEGEMMInterleave4x4Kernel _input_interleave_kernel; - NEConvolutionLayerReshapeWeights _reshape_weights; - NEGEMMMatrixMultiplyKernel _mm_kernel; - NECol2ImKernel _output_col2im_kernel; - Tensor _input_im2col_reshaped; - Tensor _input_interleaved_reshaped; - Tensor _weights_reshaped; - Tensor _gemm_output; - bool _has_bias; - bool _is_fully_connected_convolution; - bool _are_weights_reshaped; + MemoryGroup _memory_group; + NEIm2ColKernel _input_im2col_kernel; + NEGEMMInterleave4x4Kernel _input_interleave_kernel; + NEConvolutionLayerReshapeWeights _reshape_weights; + NEGEMMMatrixMultiplyKernel _mm_kernel; + std::unique_ptr _mm_optimised_kernel; + NECol2ImKernel _output_col2im_kernel; + Tensor _input_im2col_reshaped; + Tensor _input_interleaved_reshaped; + Tensor _weights_reshaped; + Tensor _gemm_output; + Tensor _workspace; + bool _has_bias; + bool _is_fully_connected_convolution; + bool _are_weights_reshaped; }; } #endif /* __ARM_COMPUTE_NECONVOLUTIONLAYER_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NEGEMM.h b/arm_compute/runtime/NEON/functions/NEGEMM.h index b4b9e8be01..068e7c5ce8 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMM.h +++ b/arm_compute/runtime/NEON/functions/NEGEMM.h @@ -25,6 +25,7 @@ #define __ARM_COMPUTE_NEGEMM_H__ #include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h" @@ -51,6 +52,7 @@ class NEGEMM : public IFunction public: /** Constructor */ NEGEMM(std::shared_ptr memory_manager = nullptr); + /** Initialise the kernel's inputs, output * * @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C]. @@ -69,15 +71,17 @@ public: void run() override; private: - MemoryGroup _memory_group; - NEGEMMInterleave4x4Kernel _interleave_kernel; - NEGEMMTranspose1xWKernel _transpose_kernel; - NEGEMMMatrixMultiplyKernel _mm_kernel; - NEGEMMMatrixAdditionKernel _ma_kernel; - Tensor _tmp_a; - Tensor _tmp_b; - bool _run_vector_matrix_multiplication; - bool _run_addition; + MemoryGroup _memory_group; + NEGEMMInterleave4x4Kernel _interleave_kernel; + NEGEMMTranspose1xWKernel _transpose_kernel; + NEGEMMMatrixMultiplyKernel _mm_kernel; + std::unique_ptr _mm_optimised_kernel; + NEGEMMMatrixAdditionKernel _ma_kernel; + Tensor _tmp_a; + Tensor _tmp_b; + Tensor _workspace; + bool _run_vector_matrix_multiplication; + bool _run_addition; }; } #endif /*__ARM_COMPUTE_NEGEMM_H__ */ -- cgit v1.2.1
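
NEGEMM (and NEConvolutionLayer above) now hold an optional NEGEMMAssemblyBaseKernel plus a workspace tensor, but the corresponding .cpp files are outside this header-only patch. The fragment below is one plausible shape for the AArch64 dispatch inside NEGEMM::configure(); the selection condition, the workspace size and the parameter names a/b/d are all assumptions for illustration, not the actual implementation:

    #if defined(__aarch64__)
        // Placeholder condition - the real check lives in NEGEMM.cpp.
        if (a->info()->data_type() == DataType::F32)
        {
            _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch64Kernel>();

            // workspace_size is a placeholder; it would be derived from the
            // assembly kernel's working-set requirements.
            _workspace.allocator()->init(TensorInfo(TensorShape{ workspace_size }, 1, DataType::U8));
            _memory_group.manage(&_workspace);

            _mm_optimised_kernel->configure(a, b, d, &_workspace, alpha, beta);
            _workspace.allocator()->allocate();
        }
    #endif /* defined(__aarch64__) */
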