From 5b6904b8d9cb5e8a343cde96fd5a8701f44dff90 Mon Sep 17 00:00:00 2001
From: Michele Di Giorgio
Date: Mon, 29 Jan 2018 12:24:14 +0000
Subject: COMPMID-866: Integrate SGEMV Neon Assembly from RSH

Change-Id: Icbb43de7642e2b433d7471d70b9dbbde850989d3
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/118197
Tested-by: Jenkins
Reviewed-by: Pablo Tello
---
 .../core/NEON/kernels/assembly/gemv_transposed.hpp |  101 +++
 .../kernels/assembly/kernels/a64_sgemv_trans.hpp   |   50 ++
 .../core/NEON/kernels/assembly/kernels/generic.hpp |  913 +++++++++++++++++++++
 3 files changed, 1064 insertions(+)
 create mode 100644 arm_compute/core/NEON/kernels/assembly/gemv_transposed.hpp
 create mode 100644 arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemv_trans.hpp
 create mode 100644 arm_compute/core/NEON/kernels/assembly/kernels/generic.hpp

(limited to 'arm_compute/core/NEON/kernels/assembly')

diff --git a/arm_compute/core/NEON/kernels/assembly/gemv_transposed.hpp b/arm_compute/core/NEON/kernels/assembly/gemv_transposed.hpp
new file mode 100644
index 0000000000..098fdaa7ac
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/gemv_transposed.hpp
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include <stdio.h>
+
+#include "gemm_common.hpp"
+
+#include "profiler.hpp"
+#include "transform.hpp"
+#include "mergeresults.hpp"
+
+// Some macros used to decide how much working space to allocate.
+// Round allocations up to the next cache line.
+#define ALLOC_ROUND 64
+#define ROUND_UP(x) ((((x) + ALLOC_ROUND-1) / ALLOC_ROUND) * ALLOC_ROUND)
+
+// Implementation of the GemmCommon abstract class.
+//
+// This implementation is for GEMV with a transposed matrix.
+//
+// By default the source data is used in-place, but if type conversion is
+// needed we need to allocate working space (CURRENTLY NOT IMPLEMENTED).
+
+template<typename strategy, typename To, typename Tr>
+class GemvTransposed : public GemmCommon<To, Tr> {
+    typedef typename strategy::operand_type Toi;
+    typedef typename strategy::result_type Tri;
+
+    const unsigned int N;
+    const unsigned int K;
+
+    const strategy strat;
+
+    unsigned int m_block;
+    unsigned int n_block;
+
+    size_t get_a_working_size() const {
+        return ROUND_UP(sizeof(Toi) * m_block);
+    }
+
+    size_t get_b_working_size() const {
+        return ROUND_UP(sizeof(Toi) * m_block * n_block);
+    }
+
+    size_t get_c_working_size() const {
+        return ROUND_UP(sizeof(Tri) * n_block);
+    }
+
+public:
+    size_t get_working_size() const override {
+        return get_a_working_size() + get_b_working_size() + get_c_working_size();
+    }
+
+    GemvTransposed(const CPUInfo *ci, const unsigned int N, const unsigned int K) : N(N), K(K), strat(ci) {
+        /* For now don't do any blocking. TODO: figure out if we should. */
+        m_block = K;
+        n_block = N;
+    }
+
+    // Actually execute the GEMV.
+    void execute(const To *A, const int lda, const To *B, const int ldb, Tr *C, const int ldc, const Tr alpha, const Tr beta, void *working_space) const override {
+        profiler prof;
+
+        static_assert(std::is_same<To, Toi>::value, "gemv_transposed: Operand types must be the same.");
+        static_assert(std::is_same<Tr, Tri>::value, "gemv_transposed: Result types must be the same.");
+
+        for (unsigned int m0=0; m0<K; m0+=m_block) {
+            unsigned int mmax = m0 + m_block;
+            if (mmax > K) mmax = K;
+
+            for (unsigned int n0=0; n0<N; n0+=n_block) {
+                unsigned int nmax = n0 + n_block;
+                if (nmax > N) nmax = N;
+
+                prof(PROFILE_KERNEL, ((mmax-m0) * (nmax-n0)), [&](void) { strat.kernel(B + (m0 * ldb) + n0, A + m0, C + n0, alpha, ldb, (mmax-m0), (nmax-n0)); });
+            }
+        }
+    }
+};
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemv_trans.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemv_trans.hpp
new file mode 100644
index 0000000000..2a39ca1f07
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemv_trans.hpp
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+// Actual kernel implementations
+#include "generic.hpp"
+
+// Transposed SGEMV strategy class.
+class sgemv_trans {
+public:
+    typedef float operand_type;
+    typedef float result_type;
+
+    typedef void (*kern_type)(const float *, const float *, float *, float, int, int, int);
+
+    /* Kernel blocking parameters */
+    static const int out_width = 12;
+    static const int k_unroll = 1;
+
+    kern_type kernel;
+
+    sgemv_trans(const CPUInfo *ci) {
+        kernel = a64_sgemv_trans;
+    }
+};
+
+#endif // __aarch64__
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/generic.hpp
new file mode 100644
index 0000000000..33f2b701cf
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/kernels/generic.hpp
@@ -0,0 +1,913 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include <arm_neon.h>
+#include "asmlib.hpp"
+
+// Kernel implementation - transposed GEMV
+//
+// The kernel will process "M" rows of A (= steps of dot product) and "N"
+// columns (= dot products total)
+//
+// General plan is to do as many columns simultaneously as possible - a
+// reasonable limit is half the NEON regfile = 64 total accumulators.
+//
+// It's possible that messing around with sub-blocking M and N can yield
+// higher performance, but that's left to the outer loop. In this kernel we
+// process all of M at the same time.
+
+
+// How far ahead to prefetch for the first and subsequent prefetches.
+// These values work for A72 on JunoR2...
+
+#define FIRST_PFD 9
+#define PFD 6
+
+inline void a64_sgemv_trans(const float *Astart, const float *Xstart, float *Ystart, float alpha, int lda, int M, int N) {
+    const float *a_ptr_base = Astart;
+    float *y_ptr = Ystart;
+
+    register const float32x4_t va asm("v1") = vdupq_n_f32(alpha);
+
+    int firstpfd=FIRST_PFD;
+    if (firstpfd > M) {
+        firstpfd = (M-1);
+    }
+
+    int pfd = PFD;
+    if (pfd > M) {
+        pfd = (M-1);
+    }
+
+    ptrdiff_t jump = lda * sizeof(int);
+
+    for (;N>=96;N-=96) {
+        int k = M-1;
+
+        const float *a_ptr = a_ptr_base;
+        const float *x_ptr = Xstart;
+        const float *pf_ptr = a_ptr;
+        const float *firstpf_ptr = a_ptr;
+        const float *pf_limit = a_ptr + (M * lda);
+
+        for (int i=0; i<firstpfd; i++) {
[... the prefetch warm-up loops and the main 96-column inline NEON assembly block are not shown ...]
+    }
+
+    if (N>0) {
+        // Handle N tail - up to 95 stragglers.
+        // This is 0-23 vectors, plus optionally a 64-bit vector and/or a
+        // single value for the remainder.
+
+        // Independent pointers into the matrix for the odd 2 and odd 1.
+        // Double up as flag to indicate whether they are needed.
+        const float *odd2_aptr=NULL;
+        const float *odd1_aptr=NULL;
+
+        // Figure out how much work we need to do.
+        int numvecs = N/4;
+        int rem = N%4;
+        int k=M;
+
+        // Set up pointers for the odd 2/1 if needed.
+        if (rem >= 2) {
+            odd2_aptr = a_ptr_base + (numvecs * 4);
+        }
+
+        if (rem & 1) {
+            odd1_aptr = a_ptr_base + (numvecs * 4) + (odd2_aptr==NULL ? 0 : 2);
+        }
+
+        const float *a_ptr = a_ptr_base;
+        const float *firstpf_ptr = a_ptr_base;
+        const float *pf_ptr = a_ptr_base;
+        const float *pf_limit = a_ptr + (M * lda);
+
+        const float *x_ptr = Xstart;
+        int vecs=0; // Working variable to count how many vectors to work on.
+        int dopf=1; // Track whether we are doing prefetches.
+
+        // Figure out how many cache lines we need to prefetch each time.
+        int numpfs = (N + 15) / 16;
+
+        // Do initial prefetches
+        for (int i=0; i<firstpfd; i++) {
[... prefetch loop body not shown ...]
+        }
+
+        if (numpfs > 1) {
+            for (int i=0; i<pfd; i++) {
[... the remainder of the tail-handling code, including its inline NEON assembly, is not shown ...]
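
For context, the following is a minimal usage sketch of the classes this patch adds; it is not part of the patch. The wrapper name run_sgemv_trans and the argument layout are assumptions for illustration, and it relies on the surrounding arm_gemm scaffolding (GemmCommon, CPUInfo, profiler) that these headers already include. Note that each working-buffer size reported by get_working_size() is rounded up to whole 64-byte cache lines by ROUND_UP, e.g. a 400-byte request becomes 448 bytes (7 lines), although no working space is actually consumed while operands are used in place.

// Hypothetical wrapper (illustrative only, not part of the patch): drive the
// transposed-GEMV path for a K x N float matrix A with row stride lda.
#include "gemv_transposed.hpp"
#include "kernels/a64_sgemv_trans.hpp"

void run_sgemv_trans(const CPUInfo *ci,
                     const float *A, int lda,   // K x N matrix, rows lda floats apart
                     const float *x,            // input vector, length K
                     float *y,                  // output vector, length N
                     unsigned int N, unsigned int K,
                     float alpha, float beta) {
    // Instantiate the template with the AArch64 SGEMV strategy added above.
    GemvTransposed<sgemv_trans, float, float> gemv(ci, N, K);

    // In GemmCommon terms the vector x plays the role of "A" (1 x K) and the
    // matrix plays the role of "B" (K x N); execute() only forwards ldb to the
    // kernel, so the vector leading dimensions are unused here. No working
    // space is passed because type conversion is not yet implemented.
    gemv.execute(x, /*lda*/ 0, A, lda, y, /*ldc*/ 0, alpha, beta, nullptr);
}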
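The header comment in generic.hpp describes the computation as M dot-product steps across N columns. As a reading aid, here is a plain scalar sketch of that reduction, matching the kern_type signature declared in a64_sgemv_trans.hpp. It is an assumption-laden reference rather than the library's code: in particular it assumes the kernel writes its results instead of accumulating into y, and it models none of the 96-column vectorisation or the prefetch scheme.

// Scalar reference sketch (assumed semantics): each output y[n] is the dot
// product of column n of the M x N block A (row stride lda) with the vector x,
// scaled by alpha. The real kernel processes 96 columns per pass in its main
// loop and handles the N%96 tail (vectors, an odd pair, an odd single) separately.
static void sgemv_trans_reference(const float *A, const float *x, float *y,
                                  float alpha, int lda, int M, int N) {
    for (int n = 0; n < N; n++) {
        float acc = 0.0f;
        for (int m = 0; m < M; m++) {
            acc += A[m * lda + n] * x[m];
        }
        y[n] = alpha * acc;
    }
}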