From 5b6904b8d9cb5e8a343cde96fd5a8701f44dff90 Mon Sep 17 00:00:00 2001
From: Michele Di Giorgio
Date: Mon, 29 Jan 2018 12:24:14 +0000
Subject: COMPMID-866: Integrate SGEMV Neon Assembly from RSH

Change-Id: Icbb43de7642e2b433d7471d70b9dbbde850989d3
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/118197
Tested-by: Jenkins
Reviewed-by: Pablo Tello
---
 arm_compute/core/NEON/NEKernels.h                   |   3 +-
 .../core/NEON/kernels/arm64/NEGEMVAArch64Kernel.h   |  48 ++
 .../core/NEON/kernels/assembly/gemv_transposed.hpp  | 101 +++
 .../kernels/assembly/kernels/a64_sgemv_trans.hpp    |  50 ++
 .../core/NEON/kernels/assembly/kernels/generic.hpp  | 913 +++++++++++++++++++++
 .../NEON/kernels/arm64/NEGEMVAArch64Kernel.cpp      | 130 +++
 src/runtime/NEON/functions/NEGEMM.cpp               |  41 +-
 7 files changed, 1283 insertions(+), 3 deletions(-)
 create mode 100644 arm_compute/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.h
 create mode 100644 arm_compute/core/NEON/kernels/assembly/gemv_transposed.hpp
 create mode 100644 arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemv_trans.hpp
 create mode 100644 arm_compute/core/NEON/kernels/assembly/kernels/generic.hpp
 create mode 100644 src/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.cpp

diff --git a/arm_compute/core/NEON/NEKernels.h b/arm_compute/core/NEON/NEKernels.h
index d5c4c340ee..6954293951 100644
--- a/arm_compute/core/NEON/NEKernels.h
+++ b/arm_compute/core/NEON/NEKernels.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2018 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -117,6 +117,7 @@
 #include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h"
 #include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h"
 #include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h"
+#include "arm_compute/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.h"
 #include "arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h"
 
 #endif /* __ARM_COMPUTE_NEKERNELS_H__ */
diff --git a/arm_compute/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.h b/arm_compute/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.h
new file mode 100644
index 0000000000..9fb3ce415a
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGEMVAARCH64KERNEL_H__
+#define __ARM_COMPUTE_NEGEMVAARCH64KERNEL_H__
+
+#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** AArch64 NEON kernel to multiply an input vector "A" and a matrix "B". */
+class NEGEMVAArch64Kernel : public NEGEMMAssemblyBaseKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEGEMVAArch64Kernel";
+    }
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+protected:
+    void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) override;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_NEGEMVAARCH64KERNEL_H__*/
diff --git a/arm_compute/core/NEON/kernels/assembly/gemv_transposed.hpp b/arm_compute/core/NEON/kernels/assembly/gemv_transposed.hpp
new file mode 100644
index 0000000000..098fdaa7ac
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/gemv_transposed.hpp
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include <stdio.h>
+
+#include "gemm_common.hpp"
+
+#include "profiler.hpp"
+#include "transform.hpp"
+#include "mergeresults.hpp"
+
+// Some macros used to decide how much working space to allocate.
+// Round allocations up to the next cache line.
+#define ALLOC_ROUND 64
+#define ROUND_UP(x) ((((x) + ALLOC_ROUND-1) / ALLOC_ROUND) * ALLOC_ROUND)
+
+// Implementation of the GemmCommon abstract class.
+//
+// This implementation is for GEMV with a transposed matrix.
+//
+// By default the source data is used in-place, but if type conversion is
+// needed we need to allocate working space (CURRENTLY NOT IMPLEMENTED).
+
+template<typename strategy, typename To, typename Tr>
+class GemvTransposed : public GemmCommon<To, Tr> {
+    typedef typename strategy::operand_type Toi;
+    typedef typename strategy::result_type Tri;
+
+    const unsigned int N;
+    const unsigned int K;
+
+    const strategy strat;
+
+    unsigned int m_block;
+    unsigned int n_block;
+
+    size_t get_a_working_size() const {
+        return ROUND_UP(sizeof(Toi) * m_block);
+    }
+
+    size_t get_b_working_size() const {
+        return ROUND_UP(sizeof(Toi) * m_block * n_block);
+    }
+
+    size_t get_c_working_size() const {
+        return ROUND_UP(sizeof(Tri) * n_block);
+    }
+
+public:
+    size_t get_working_size() const override {
+        return get_a_working_size() + get_b_working_size() + get_c_working_size();
+    }
+
+    GemvTransposed(const CPUInfo *ci, const unsigned int N, const unsigned int K) : N(N), K(K), strat(ci) {
+        /* For now don't do any blocking. TODO: figure out if we should. */
+        m_block = K;
+        n_block = N;
+    }
+
+    // Actually execute the GEMV.
+    void execute(const To *A, const int lda, const To *B, const int ldb, Tr *C, const int ldc, const Tr alpha, const Tr beta, void *working_space) const override {
+        profiler prof;
+
+        static_assert(std::is_same<To, Toi>::value, "gemv_transposed: Operand types must be the same.");
+        static_assert(std::is_same<Tr, Tri>::value, "gemv_transposed: Result types must be the same.");
+
+        for (unsigned int m0=0; m0<K; m0+=m_block) {
+            unsigned int mmax = m0 + m_block;
+            if (mmax > K) mmax = K;
+
+            for (unsigned int n0=0; n0<N; n0+=n_block) {
+                unsigned int nmax = n0 + n_block;
+                if (nmax > N) nmax = N;
+
+                prof(PROFILE_KERNEL, ((mmax-m0) * (nmax-n0)), [&](void) { strat.kernel(B + (m0 * ldb) + n0, A + m0, C + n0, alpha, ldb, (mmax-m0), (nmax-n0)); });
+            }
+        }
+    }
+};
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemv_trans.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemv_trans.hpp
new file mode 100644
index 0000000000..2a39ca1f07
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemv_trans.hpp
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+// Actual kernel implementations
+#include "generic.hpp"
+
+// Transposed SGEMV strategy class.
+class sgemv_trans {
+public:
+    typedef float operand_type;
+    typedef float result_type;
+
+    typedef void (*kern_type)(const float *, const float *, float *, float, int, int, int);
+
+    /* Kernel blocking parameters */
+    static const int out_width = 12;
+    static const int k_unroll = 1;
+
+    kern_type kernel;
+
+    sgemv_trans(const CPUInfo *ci) {
+        kernel = a64_sgemv_trans;
+    }
+};
+
+#endif // __aarch64__
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/generic.hpp
new file mode 100644
index 0000000000..33f2b701cf
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/kernels/generic.hpp
@@ -0,0 +1,913 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include <arm_neon.h>
+#include "asmlib.hpp"
+
+// Kernel implementation - transposed GEMV
+//
+// The kernel will process "M" rows of A (= steps of dot product) and "N"
+// columns (= dot products total)
+//
+// General plan is to do as many columns simultaneously as possible - a
+// reasonable limit is half the NEON regfile = 64 total accumulators.
+//
+// It's possible that messing around with sub-blocking M and N can yield
+// higher performance, but that's left to the outer loop. In this kernel we
+// process all of M at the same time.
+
+
+// How far ahead to prefetch for the first and subsequent prefetches.
+// These values work for A72 on JunoR2...
+
+#define FIRST_PFD 9
+#define PFD 6
+
+inline void a64_sgemv_trans(const float *Astart, const float *Xstart, float *Ystart, float alpha, int lda, int M, int N) {
+    const float *a_ptr_base = Astart;
+    float *y_ptr = Ystart;
+
+    register const float32x4_t va asm("v1") = vdupq_n_f32(alpha);
+
+    int firstpfd=FIRST_PFD;
+    if (firstpfd > M) {
+        firstpfd = (M-1);
+    }
+
+    int pfd = PFD;
+    if (pfd > M) {
+        pfd = (M-1);
+    }
+
+    ptrdiff_t jump = lda * sizeof(int);
+
+    for (;N>=96;N-=96) {
+        int k = M-1;
+
+        const float *a_ptr = a_ptr_base;
+        const float *x_ptr = Xstart;
+        const float *pf_ptr = a_ptr;
+        const float *firstpf_ptr = a_ptr;
+        const float *pf_limit = a_ptr + (M * lda);
+
+        for (int i=0; i0) {
+        // Handle N tail - up to 95 stragglers.
+        // This is 0-23 vectors, plus optionally a 64-bit vector and/or a
+        // single value for the remainder.
+
+        // Independent pointers into the matrix for the odd 2 and odd 1.
+        // Double up as flag to indicate whether they are needed.
+        const float *odd2_aptr=NULL;
+        const float *odd1_aptr=NULL;
+
+        // Figure out how much work we need to do.
+        int numvecs = N/4;
+        int rem = N%4;
+        int k=M;
+
+        // Set up pointers for the odd 2/1 if needed.
+        if (rem >= 2) {
+            odd2_aptr = a_ptr_base + (numvecs * 4);
+        }
+
+        if (rem & 1) {
+            odd1_aptr = a_ptr_base + (numvecs * 4) + (odd2_aptr==NULL ? 0 : 2);
+        }
+
+        const float *a_ptr = a_ptr_base;
+        const float *firstpf_ptr = a_ptr_base;
+        const float *pf_ptr = a_ptr_base;
+        const float *pf_limit = a_ptr + (M * lda);
+
+        const float *x_ptr = Xstart;
+        int vecs=0; // Working variable to count how many vectors to work on.
+        int dopf=1; // Track whether we are doing prefetches.
+
+        // Figure out how many cache lines we need to prefetch each time.
+        int numpfs = (N + 15) / 16;
+
+        // Do initial prefetches
+        for (int i=0; i 1) {
+            for (int i=0; i
diff --git a/src/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.cpp b/src/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.cpp
new file mode 100644
--- /dev/null
+++ b/src/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.cpp
@@ -0,0 +1,130 @@
+#include <arm_neon.h>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_compute
+{
+void NEGEMVAArch64Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
+
+    _input0 = input0;
+    _input1 = input1;
+    _output = output;
+    _workspace = workspace;
+    _alpha = alpha;
+    _beta = beta;
+    _is_transposed_0 = is_transposed_0;
+    _is_transposed_1 = is_transposed_1;
+
+    // Configure kernel window
+    Window win = calculate_max_window(*output->info());
+
+    AccessWindowRectangle output_access(output->info(), 0, 0, 12, 8);
+
+    const int input0_access_end = ceil_to_multiple(input0->info()->tensor_shape().x(), 8);
+    const int input1_access_end = ceil_to_multiple(input1->info()->tensor_shape().x(), 12);
+
+    update_window_and_padding(win,
+                              AccessWindowStatic(input0->info(), 0, 0, input0_access_end, input0->info()->tensor_shape().y()),
+                              AccessWindowStatic(input1->info(), 0, 0, input1_access_end, input1->info()->tensor_shape().y()),
+                              output_access);
+
+    INEKernel::configure(win);
+}
+
+void NEGEMVAArch64Kernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+    const int lda = _input0->info()->strides_in_bytes().y() / sizeof(sgemv_trans::operand_type);
+    const int ldb = _input1->info()->strides_in_bytes().y() / sizeof(sgemv_trans::operand_type);
+    const int ldc = _output->info()->strides_in_bytes().y() / sizeof(sgemv_trans::result_type);
+
+    const auto in1_ptr = reinterpret_cast<const sgemv_trans::operand_type *>(_input1->buffer());
+
+    const int N = _output->info()->tensor_shape().x();
+    const int K = _input0->info()->tensor_shape().x();
+
+    // Only iterate over batches
+    Window win(window);
+    win.set(0, Window::Dimension(0, 1, 1));
+    win.set(1, Window::Dimension(0, 1, 1));
+
+    Iterator in0(_input0, window);
+    Iterator out(_output, window);
+
+    GemvTransposed<sgemv_trans, sgemv_trans::operand_type, sgemv_trans::result_type> gemm(&info.cpu_info, N, K);
+    constexpr size_t alignment = 4096;
+    const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id;
+    void *workspace = _workspace->buffer() + offset;
+    size_t workspace_size = _workspace->info()->total_size();
+
+    if(support::cpp11::align(alignment, gemm.get_working_size(), workspace, workspace_size) == nullptr)
+    {
+        ARM_COMPUTE_ERROR("Not enough space to align buffer!");
+    }
+
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        gemm.execute(reinterpret_cast<const sgemv_trans::operand_type *>(in0.ptr()), lda,
+                     reinterpret_cast<const sgemv_trans::operand_type *>(in1_ptr), ldb,
+                     reinterpret_cast<sgemv_trans::result_type *>(out.ptr()), ldc,
+                     _alpha, _beta, workspace);
+    },
+    in0, out);
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index 29424f5d33..48a0d2af1c 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -28,6 +28,7 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h"
 #include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h"
+#include "arm_compute/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.h"
 #include "arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
@@ -40,10 +41,13 @@ namespace arm_compute
 {
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wswitch-default"
+#pragma GCC diagnostic ignored "-Weffc++"
 #include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/gemv_transposed.hpp"
 #include "arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp"
 #include "arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8.hpp"
 #include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemv_trans.hpp"
 #pragma GCC diagnostic pop
 } // namespace arm_compute
@@ -83,8 +87,41 @@ void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITe
     // If so, all the kernels for reshaping the tensors can be skipped
     if(_run_vector_matrix_multiplication)
     {
-        // Configure the matrix multiply kernel
-        _mm_kernel.configure(a, b, d, alpha);
+#if defined(__aarch64__)
+        if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f))
+        {
+            _mm_optimised_kernel = support::cpp14::make_unique<NEGEMVAArch64Kernel>();
+        }
+
+        if(_mm_optimised_kernel != nullptr)
+        {
+            struct CPUInfo ci = NEScheduler::get().cpu_info();
+
+            const int N = d->info()->tensor_shape().x();
+            const int K = a->info()->tensor_shape().x();
+
+            size_t workbench_size = 0;
+
+            if(a->info()->data_type() == DataType::F32)
+            {
+                workbench_size = GemvTransposed<sgemv_trans, sgemv_trans::operand_type, sgemv_trans::result_type>(&ci, N, K).get_working_size();
+            }
+
+            constexpr size_t alignment = 4096;
+            ARM_COMPUTE_ERROR_ON_MSG(workbench_size == 0, "size cannot be 0");
+            _workspace.allocator()->init(TensorInfo(TensorShape{ (workbench_size + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::S8));
+            _memory_group.manage(&_workspace);
+
+            // Configure matrix multiplication kernel
+            _mm_optimised_kernel->configure(a, b, d, &_workspace, alpha, 0.f, false /* is_transposed_0 */, false /* is_transposed_1 */);
+            _workspace.allocator()->allocate();
+        }
+        else
+#endif /* defined(__aarch64__) */
+        {
+            // Configure the matrix multiply kernel
+            _mm_kernel.configure(a, b, d, alpha);
+        }
 
         // Configure matrix addition kernel
         if(beta != 0 && c != nullptr)
-- 
cgit v1.2.1
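
Note for readers (not part of the patch): a minimal scalar sketch of the computation the new transposed-GEMV path is intended to perform. It follows the sgemv_trans kern_type signature above (matrix pointer, x, y, alpha, lda, M, N) and assumes the beta == 0 configuration that NEGEMM::configure() passes to the optimised kernel. The function name sgemv_trans_reference is an illustrative assumption, and the sketch ignores the prefetching, 96-column blocking and tail handling done by the real assembly in generic.hpp.

    // Illustrative reference only (assumption, not the assembly kernel's exact
    // store/accumulate behaviour): y[n] = alpha * sum_m x[m] * A[m * lda + n],
    // for n in [0, N) and m in [0, M), with A stored row-major with leading
    // dimension lda.
    static void sgemv_trans_reference(const float *A, const float *x, float *y,
                                      float alpha, int lda, int M, int N)
    {
        for(int n = 0; n < N; n++)
        {
            float acc = 0.f;
            for(int m = 0; m < M; m++)
            {
                acc += x[m] * A[m * lda + n]; // walk down column n: one dot product per output element
            }
            y[n] = alpha * acc; // beta is fixed to 0.f in this integration, so y is simply overwritten
        }
    }

The assembly path in generic.hpp processes columns in blocks of 96 per outer iteration (the for (;N>=96;N-=96) loop) and then handles up to 95 remaining columns with 4-wide vector, 2-wide and single-element tails, which is what the odd2_aptr/odd1_aptr bookkeeping above supports.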