From 778b95cb755880ab4adc972fbf3b7022a99b63f9 Mon Sep 17 00:00:00 2001 From: Michalis Spyrou Date: Tue, 20 Apr 2021 12:15:52 +0100 Subject: Update assembly code This patch brings performance uplift on Cortex-A35. Resolves: COMPMID-4316 Change-Id: I2b9c02a599373f780dd1b981b821e33bd59a3422 Signed-off-by: Michalis Spyrou Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5461 Reviewed-by: Georgios Pinitas Tested-by: Arm Jenkins Comments-Addressed: Arm Jenkins --- src/core/NEON/kernels/arm_gemm/asmlib.hpp | 1 + src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp | 12 +- .../kernels/arm_gemm/kernels/a64_sgemm_8x6.hpp | 75 ++++ .../arm_gemm/kernels/a64_sgemm_8x6/generic.cpp | 381 +++++++++++++++++++++ src/core/NEON/kernels/arm_gemm/mergeresults.cpp | 9 +- 5 files changed, 474 insertions(+), 4 deletions(-) create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x6.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x6/generic.cpp (limited to 'src/core/NEON/kernels') diff --git a/src/core/NEON/kernels/arm_gemm/asmlib.hpp b/src/core/NEON/kernels/arm_gemm/asmlib.hpp index 00674334a9..7766656adb 100644 --- a/src/core/NEON/kernels/arm_gemm/asmlib.hpp +++ b/src/core/NEON/kernels/arm_gemm/asmlib.hpp @@ -32,6 +32,7 @@ // "Correct" version #define ASM_PREFETCH(address) "PRFM PLDL1KEEP, " address "\n" +#define ASM_PREFETCHU(address) "PRFUM PLDL1KEEP, " address "\n" #define ASM_PREFETCHL2(address) "PRFM PLDL2KEEP, " address "\n" #define ASM_PREFETCHW(address) "PRFM PSTL1KEEP, " address "\n" #define ASM_PREFETCHWL2(address) "PRFM PSTL2KEEP, " address "\n" diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp index 6c3743dce7..d94814fb4c 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -35,6 +35,7 @@ #include "kernels/a64_hybrid_fp32_mla_6x16.hpp" #include "kernels/a64_hybrid_fp32_mla_8x4.hpp" #include "kernels/a64_sgemm_8x12.hpp" +#include "kernels/a64_sgemm_8x6.hpp" #include "kernels/a64_smallK_hybrid_fp32_mla_6x4.hpp" #include "kernels/a64_smallK_hybrid_fp32_mla_8x4.hpp" @@ -110,7 +111,14 @@ static const GemmImplementation gemm_fp32_methods[] = [](const GemmArgs &args) { return new GemmHybridIndirect(args); } }, #endif // __ARM_FEATURE_SVE - +// Cortex-A35 specific kernel - use for any problem on A35, and never in any other cases. +{ + GemmMethod::GEMM_INTERLEAVED, + "a64_sgemm_8x6", + nullptr, + [](const GemmArgs &args) { return args._ci->get_cpu_model() == CPUModel::A35; }, + [](const GemmArgs &args) { return new GemmInterleaved(args); } +}, // Arm® Neon™ hybrid methods { GemmMethod::GEMM_HYBRID, diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x6.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x6.hpp new file mode 100644 index 0000000000..c1318a2a06 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x6.hpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#ifdef __aarch64__ + +#include "../std_transforms_fixed.hpp" + +namespace arm_gemm { + +// Actual kernel implementations +void a64_sgemm_asimd_8x6(const float *, const float *, float *, int, int, int); + +// 8x6 SGEMM "strategy" class. +// +// This describes the characteristics of a family of kernels, in terms of +// the required interleave properties and the output block size. +// +// All kernels in the family must share these characteristics. The actual +// kernel to be used can be chosen at runtime, based on the CPU_type +// structure. +class cls_a64_sgemm_8x6 { +public: + typedef float operand_type; + typedef float result_type; + + typedef void (*kern_type)(const float *, const float *, float *, int, int, int); + + /* Kernel blocking parameters */ + static unsigned int out_width() { + return 6; + } + + static unsigned int out_height() { + return 8; + } + + static unsigned int k_unroll() { + return 1; + } + + // Use the standard fixed size transforms. + StdTransformsFixed transforms = {}; + + kern_type kernel=a64_sgemm_asimd_8x6; + + cls_a64_sgemm_8x6(const CPUInfo *) { + + } +}; + +} // namespace arm_gemm + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x6/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x6/generic.cpp new file mode 100644 index 0000000000..9b81374d2d --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x6/generic.cpp @@ -0,0 +1,381 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifdef __aarch64__ + +#include + +#include "../../asmlib.hpp" + +// Kernel implementation. +// +// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order. +// Assume that "Bpanel" points to a chunk of B blocks (each size 12xK) in read-order. +// Assume that "Cpanel" points to a chunk of C output blocks (each size +// 12x8), the chunks being arranged in a row major fashion. +// +// Note that the intent of this is that either ablocks or bblocks will be 1 +// - this construction allows the output loop to proceed in either order. + +namespace arm_gemm { + +void a64_sgemm_asimd_8x6(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { + const float *a_ptr = Apanel; + float *c_ptr = Cpanel; + + for (int yb=0; yb -#include #include @@ -97,6 +96,12 @@ void MergeResults(Tout * out, const Tin * in, int ldc, int y0, int ymax, int x0, #include "merges/list.hpp" +/* Cortex-A53 8x6 SGEMM kernel uses a templated merge as the optimized merge + * generator cannot cope with the width (6) not being a multiple of VL (4). */ +#ifdef __aarch64__ +template void MergeResults<6u, 8u, false, float, float>(float *, float const*, int, int, int, int, int, float const *, Activation, bool); +#endif + #if defined(__aarch64__) && defined(__ARM_FP16_ARGS) template void MergeResults<12u, 8u, false, float, __fp16>(__fp16*, float const*, int, int, int, int, int, __fp16 const*, Activation, bool); #endif -- cgit v1.2.1