From 94672fb2af6535adc6ea7fe8b8498580ad8cf3f4 Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Wed, 22 Jan 2020 18:36:27 +0000 Subject: COMPMID-3003: Integrate assembly kernels utilizing MMLA instruction. MMLA is a matrix-multiply instruction introduced on armv8.6-A Signed-off-by: Georgios Pinitas Change-Id: I572a54981d48f5a1e0e9e51102cb7ae28ad87806 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/2663 Tested-by: Arm Jenkins Reviewed-by: Michalis Spyrou Comments-Addressed: Arm Jenkins --- Android.bp | 4 + SConstruct | 19 +- src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp | 2 +- src/core/NEON/kernels/arm_gemm/gemm_int8.cpp | 20 + src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp | 20 + .../kernels/a64_interleaved_s8s32_mmla_12x8.hpp | 72 +++ .../a64_interleaved_s8s32_mmla_12x8/generic.cpp | 393 ++++++++++++++ .../kernels/a64_interleaved_u8u32_mmla_12x8.hpp | 72 +++ .../a64_interleaved_u8u32_mmla_12x8/generic.cpp | 393 ++++++++++++++ .../kernels/sve_interleaved_s8s32_mmla_3VLx8.hpp | 72 +++ .../sve_interleaved_s8s32_mmla_3VLx8/generic.cpp | 395 ++++++++++++++ .../kernels/sve_interleaved_u8u32_mmla_3VLx8.hpp | 72 +++ .../sve_interleaved_u8u32_mmla_3VLx8/generic.cpp | 395 ++++++++++++++ .../arm_gemm/merges/a64_merge_fp16_24x8.hpp | 45 ++ .../a64_transpose_interleave_8way_32bit.hpp | 147 +++++ src/core/NEON/kernels/arm_gemm/transforms/list.hpp | 4 + .../sve_interleave_8way_block2_16bit.hpp | 596 +++++++++++++++++++++ .../sve_interleave_8way_block4_16bit.hpp | 596 +++++++++++++++++++++ .../transforms/sve_interleave_8way_block8_8bit.hpp | 596 +++++++++++++++++++++ 19 files changed, 3903 insertions(+), 10 deletions(-) create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_8way_32bit.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block2_16bit.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block4_16bit.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block8_8bit.hpp diff --git a/Android.bp b/Android.bp index 8ebe4c52d1..e7ad65187f 100644 --- a/Android.bp +++ b/Android.bp @@ -764,6 +764,8 @@ cc_library_static { "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_native_fp32_mla_16x4/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a53.cpp", 
"src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55.cpp", @@ -788,7 +790,9 @@ cc_library_static { "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp", diff --git a/SConstruct b/SConstruct index 3236e7ca30..0b491847df 100644 --- a/SConstruct +++ b/SConstruct @@ -41,7 +41,8 @@ vars.AddVariables( BoolVariable("asserts", "Enable asserts (this flag is forced to 1 for debug=1)", False), BoolVariable("logging", "Logging (this flag is forced to 1 for debug=1)", False), EnumVariable("arch", "Target Architecture", "armv7a", - allowed_values=("armv7a", "arm64-v8a", "arm64-v8.2-a", "arm64-v8.2-a-sve", "x86_32", "x86_64", "armv8a", "armv8.2-a", "armv8.2-a-sve", "x86")), + allowed_values=("armv7a", "arm64-v8a", "arm64-v8.2-a", "arm64-v8.2-a-sve", "x86_32", "x86_64", + "armv8a", "armv8.2-a", "armv8.2-a-sve", "armv8.6-a", "x86")), EnumVariable("estate", "Execution State", "auto", allowed_values=("auto", "32", "64")), EnumVariable("os", "Target OS", "linux", allowed_values=("linux", "android", "bare_metal")), EnumVariable("build", "Build type", "cross_compile", allowed_values=("native", "cross_compile", "embed_only")), @@ -194,17 +195,17 @@ if 'v7a' in env['arch']: env.Append(CXXFLAGS = ['-mfloat-abi=softfp']) else: env.Append(CXXFLAGS = ['-mfloat-abi=hard']) -elif 'v8a' in env['arch']: - env.Append(CXXFLAGS = ['-march=armv8-a']) - if env['estate'] == '32': - env.Append(CXXFLAGS = ['-mfpu=neon-fp-armv8']) -elif 'v8.2-a' in env['arch']: - if env['estate'] == '32': - env.Append(CXXFLAGS = ['-mfpu=neon-fp-armv8']) +elif 'v8' in env['arch']: if 'sve' in env['arch']: env.Append(CXXFLAGS = ['-march=armv8.2-a+sve+fp16+dotprod']) - else: + elif 'v8.2-a' in env['arch']: env.Append(CXXFLAGS = ['-march=armv8.2-a+fp16']) # explicitly enable fp16 extension otherwise __ARM_FEATURE_FP16_VECTOR_ARITHMETIC is undefined + else: + env.Append(CXXFLAGS = ['-march=armv8-a']) + + if 'v8.6-a' in env['arch']: + env.Append(CXXFLAGS = ['-DV8P6']) + elif 'x86' in env['arch']: if env['estate'] == '32': env.Append(CCFLAGS = ['-m32']) diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp index 7f171ec15a..35493a609c 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp @@ -114,7 +114,7 @@ static const GemmImplementation gemm_fp32_methods[] = }, { GemmMethod::GEMM_HYBRID, - "hybrid_fp32_mla_16x4", + "hybrid_fp32_mla_16x4_normal", [](const GemmArgs &args) { return (args._Ksize >= 4) && !args._trA && args._pretransposed_hint; }, [](const GemmArgs &args) { return ((args._Ksize <= 256) && (args._Nsize <= 256)) || (args._Msize < 16) || (args._nmulti > 1); }, [](const GemmArgs &args) { return new GemmHybrid(args); } diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp index 
a3446b9ddc..f7d8f65aea 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp @@ -34,10 +34,12 @@ #include "kernels/a64_gemm_s8_12x8.hpp" #include "kernels/a64_gemm_s8_4x4.hpp" #include "kernels/a64_hybrid_s8s32_dot_16x4.hpp" +#include "kernels/a64_interleaved_s8s32_mmla_12x8.hpp" #include "kernels/a64_smallK_hybrid_s8s32_dot_4x6.hpp" #include "kernels/a64_smallK_hybrid_s8s32_dot_4x8.hpp" #include "kernels/sve_hybrid_s8s32_dot_4VLx4.hpp" #include "kernels/sve_interleaved_s8s32_dot_3VLx8.hpp" +#include "kernels/sve_interleaved_s8s32_mmla_3VLx8.hpp" #include "kernels/sve_native_s8s32_dot_4VLx4.hpp" #include "kernels/sve_smallK_hybrid_s8s32_dot_1VLx8.hpp" @@ -45,6 +47,15 @@ namespace arm_gemm { static const GemmImplementation gemm_s8_methods[] = { #ifdef __ARM_FEATURE_SVE +#ifdef V8P6 +{ + GemmMethod::GEMM_INTERLEAVED, + "interleaved_s8s32_mmla_3VLx8", + [](const GemmArgs &args) { return (args._Ksize>8); }, + nullptr, + [](const GemmArgs &args) { return new GemmInterleaved(args); } +}, +#endif { GemmMethod::GEMM_HYBRID, "smallK_hybrid_s8s32_dot_1VLx8", @@ -74,6 +85,15 @@ static const GemmImplementation gemm_s8_methods[] = { [](const GemmArgs &args) { return new GemmInterleaved(args); } }, #endif +#ifdef V8P6 +{ + GemmMethod::GEMM_INTERLEAVED, + "interleaved_s8s32_mmla_12x8", + [](const GemmArgs &args) { return (args._Ksize>8); }, + nullptr, + [](const GemmArgs &args) { return new GemmInterleaved(args); } +}, +#endif { GemmMethod::GEMM_HYBRID, "smallK_hybrid_s8s32_dot_4x8", diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp index aead814d7e..430d35e06d 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp @@ -34,10 +34,12 @@ #include "kernels/a64_gemm_u8_12x8.hpp" #include "kernels/a64_gemm_u8_4x4.hpp" #include "kernels/a64_hybrid_u8u32_dot_16x4.hpp" +#include "kernels/a64_interleaved_u8u32_mmla_12x8.hpp" #include "kernels/a64_smallK_hybrid_u8u32_dot_4x6.hpp" #include "kernels/a64_smallK_hybrid_u8u32_dot_4x8.hpp" #include "kernels/sve_hybrid_u8u32_dot_4VLx4.hpp" #include "kernels/sve_interleaved_u8u32_dot_3VLx8.hpp" +#include "kernels/sve_interleaved_u8u32_mmla_3VLx8.hpp" #include "kernels/sve_native_u8u32_dot_4VLx4.hpp" #include "kernels/sve_smallK_hybrid_u8u32_dot_1VLx8.hpp" @@ -45,6 +47,15 @@ namespace arm_gemm { static const GemmImplementation gemm_u8_methods[] = { #ifdef __ARM_FEATURE_SVE +#ifdef V8P6 +{ + GemmMethod::GEMM_INTERLEAVED, + "interleaved_u8u32_mmla_3VLx8", + [](const GemmArgs &args) { return (args._Ksize>8); }, + nullptr, + [](const GemmArgs &args) { return new GemmInterleaved(args); } +}, +#endif { GemmMethod::GEMM_HYBRID, "smallK_hybrid_u8u32_dot_1VLx8", @@ -74,6 +85,15 @@ static const GemmImplementation gemm_u8_methods[] = { [](const GemmArgs &args) { return new GemmInterleaved(args); } }, #endif +#ifdef V8P6 +{ + GemmMethod::GEMM_INTERLEAVED, + "interleaved_u8u32_mmla_12x8", + [](const GemmArgs &args) { return (args._Ksize>8); }, + nullptr, + [](const GemmArgs &args) { return new GemmInterleaved(args); } +}, +#endif { GemmMethod::GEMM_HYBRID, "smallK_hybrid_u8u32_dot_4x8", diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8.hpp new file mode 100644 index 0000000000..f669b870c6 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8.hpp @@ -0,0 +1,72 @@ +/* + * 
Copyright (c) 2019 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#ifdef __aarch64__ + +#include +#include "../std_transforms_fixed.hpp" + +namespace arm_gemm { + +// Actual kernel implementations +void a64_interleaved_s8s32_mmla_12x8(const int8_t *, const int8_t *, int32_t *, int, int, int); + +class interleaved_s8s32_mmla_12x8 { +public: + typedef int8_t operand_type; + typedef int32_t result_type; + + typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int); + + /* Kernel blocking parameters */ + static unsigned int out_width() + { + return 12; + } + + static unsigned int out_height() + { + return 8; + } + + static unsigned int k_unroll() + { + return 8; + } + + // Use the standard fixed size transforms. + StdTransformsFixed transforms = {}; + + kern_type kernel=a64_interleaved_s8s32_mmla_12x8; + + interleaved_s8s32_mmla_12x8(const CPUInfo *ci) + { + UNUSED(ci); + } +}; + +} // namespace arm_gemm + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8/generic.cpp new file mode 100644 index 0000000000..49dbdb866e --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8/generic.cpp @@ -0,0 +1,393 @@ +/* + * Copyright (c) 2019 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifdef __aarch64__ + +#include +#include "../../asmlib.hpp" + +namespace arm_gemm { + +void a64_interleaved_s8s32_mmla_12x8(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) { + const int8_t *a_ptr = Apanel; + int32_t *c_ptr = Cpanel; + + K /= 8; + const long loops_count = (K / 2) - 1; + const long tails_count = K % 2; + + for (int yb=0; yb +#include "../std_transforms_fixed.hpp" + +namespace arm_gemm { + +// Actual kernel implementations +void a64_interleaved_u8u32_mmla_12x8(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); + +class interleaved_u8u32_mmla_12x8 { +public: + typedef uint8_t operand_type; + typedef uint32_t result_type; + + typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); + + /* Kernel blocking parameters */ + static unsigned int out_width() + { + return 12; + } + + static unsigned int out_height() + { + return 8; + } + + static unsigned int k_unroll() + { + return 8; + } + + // Use the standard fixed size transforms. + StdTransformsFixed transforms = {}; + + kern_type kernel=a64_interleaved_u8u32_mmla_12x8; + + interleaved_u8u32_mmla_12x8(const CPUInfo *ci) + { + UNUSED(ci); + } +}; + +} // namespace arm_gemm + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8/generic.cpp new file mode 100644 index 0000000000..e182a425f4 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8/generic.cpp @@ -0,0 +1,393 @@ +/* + * Copyright (c) 2019 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifdef __aarch64__ + +#include +#include "../../asmlib.hpp" + +namespace arm_gemm { + +void a64_interleaved_u8u32_mmla_12x8(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) { + const uint8_t *a_ptr = Apanel; + uint32_t *c_ptr = Cpanel; + + K /= 8; + const long loops_count = (K / 2) - 1; + const long tails_count = K % 2; + + for (int yb=0; yb +#include "../std_transforms_sve.hpp" + +namespace arm_gemm { + +// Actual kernel implementations +void sve_interleaved_s8s32_mmla_3VLx8(const int8_t *, const int8_t *, int32_t *, int, int, int); + +class interleaved_s8s32_mmla_3VLx8 { +public: + typedef int8_t operand_type; + typedef int32_t result_type; + + typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int); + + /* Kernel blocking parameters */ + static unsigned int out_width() + { + return get_vector_length() * 3; + } + + static unsigned int out_height() + { + return 8; + } + + static unsigned int k_unroll() + { + return 8; + } + + // Use the standard fixed size transforms. + StdTransformsSVE transforms = {}; + + kern_type kernel=sve_interleaved_s8s32_mmla_3VLx8; + + interleaved_s8s32_mmla_3VLx8(const CPUInfo *ci) + { + UNUSED(ci); + } +}; + +} // namespace arm_gemm + +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8/generic.cpp new file mode 100644 index 0000000000..d636c9d2a4 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8/generic.cpp @@ -0,0 +1,395 @@ +/* + * Copyright (c) 2019 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifdef __ARM_FEATURE_SVE + +#include +#include "../../asmlib.hpp" + +namespace arm_gemm { + +void sve_interleaved_s8s32_mmla_3VLx8(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) { + const int8_t *a_ptr = Apanel; + int32_t *c_ptr = Cpanel; + + K /= 8; + const long loops_count = (K / 2) - 1; + const long tails_count = K % 2; + + for (int yb=0; yb +#include "../std_transforms_sve.hpp" + +namespace arm_gemm { + +// Actual kernel implementations +void sve_interleaved_u8u32_mmla_3VLx8(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); + +class interleaved_u8u32_mmla_3VLx8 { +public: + typedef uint8_t operand_type; + typedef uint32_t result_type; + + typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); + + /* Kernel blocking parameters */ + static unsigned int out_width() + { + return get_vector_length() * 3; + } + + static unsigned int out_height() + { + return 8; + } + + static unsigned int k_unroll() + { + return 8; + } + + // Use the standard fixed size transforms. + StdTransformsSVE transforms = {}; + + kern_type kernel=sve_interleaved_u8u32_mmla_3VLx8; + + interleaved_u8u32_mmla_3VLx8(const CPUInfo *ci) + { + UNUSED(ci); + } +}; + +} // namespace arm_gemm + +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8/generic.cpp new file mode 100644 index 0000000000..15cc8fb897 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8/generic.cpp @@ -0,0 +1,395 @@ +/* + * Copyright (c) 2019 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifdef __ARM_FEATURE_SVE + +#include +#include "../../asmlib.hpp" + +namespace arm_gemm { + +void sve_interleaved_u8u32_mmla_3VLx8(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) { + const uint8_t *a_ptr = Apanel; + uint32_t *c_ptr = Cpanel; + + K /= 8; + const long loops_count = (K / 2) - 1; + const long tails_count = K % 2; + + for (int yb=0; yb(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( +#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ".arch armv8.2-a+fp16\n" +#endif "dup v0.8h, %[maxval].h[0]\n" "ldr q2, [%[outptr0]]\n" "dup v1.8h, %[minval].h[0]\n" @@ -214,6 +217,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( +#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ".arch armv8.2-a+fp16\n" +#endif "dup v0.8h, %[maxval].h[0]\n" "ldr q2, [%[outptr0]]\n" "dup v1.8h, %[minval].h[0]\n" @@ -311,6 +317,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( +#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ".arch armv8.2-a+fp16\n" +#endif "dup v0.8h, %[maxval].h[0]\n" "ldr q2, [%[outptr0]]\n" "dup v1.8h, %[minval].h[0]\n" @@ -430,6 +439,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( +#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ".arch armv8.2-a+fp16\n" +#endif "dup v0.8h, %[maxval].h[0]\n" "ldr q2, [%[outptr0]]\n" "dup v1.8h, %[minval].h[0]\n" @@ -572,6 +584,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( +#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ".arch armv8.2-a+fp16\n" +#endif "dup v0.8h, %[maxval].h[0]\n" "ldr q2, [%[outptr0]]\n" "dup v1.8h, %[minval].h[0]\n" @@ -737,6 +752,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( +#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ".arch armv8.2-a+fp16\n" +#endif "dup v0.8h, %[maxval].h[0]\n" "ldr q2, [%[outptr0]]\n" "dup v1.8h, %[minval].h[0]\n" @@ -926,6 +944,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( +#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ".arch armv8.2-a+fp16\n" +#endif "dup v0.8h, %[maxval].h[0]\n" "ldr q2, [%[outptr0]]\n" "dup v1.8h, %[minval].h[0]\n" @@ -1133,6 +1154,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( +#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ".arch armv8.2-a+fp16\n" +#endif "dup v0.8h, %[maxval].h[0]\n" "ldr q2, [%[biasptr]]\n" "dup v1.8h, %[minval].h[0]\n" @@ -1184,6 +1208,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( +#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ".arch armv8.2-a+fp16\n" +#endif "dup v0.8h, %[maxval].h[0]\n" "ldr q2, [%[biasptr]]\n" "dup v1.8h, %[minval].h[0]\n" @@ -1255,6 +1282,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* 
Optimized routine to copy an entire block */ __asm __volatile ( +#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ".arch armv8.2-a+fp16\n" +#endif "dup v0.8h, %[maxval].h[0]\n" "ldr q2, [%[biasptr]]\n" "dup v1.8h, %[minval].h[0]\n" @@ -1346,6 +1376,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( +#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ".arch armv8.2-a+fp16\n" +#endif "dup v0.8h, %[maxval].h[0]\n" "ldr q2, [%[biasptr]]\n" "dup v1.8h, %[minval].h[0]\n" @@ -1456,6 +1489,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( +#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ".arch armv8.2-a+fp16\n" +#endif "dup v0.8h, %[maxval].h[0]\n" "ldr q2, [%[biasptr]]\n" "dup v1.8h, %[minval].h[0]\n" @@ -1586,6 +1622,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( +#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ".arch armv8.2-a+fp16\n" +#endif "dup v0.8h, %[maxval].h[0]\n" "ldr q2, [%[biasptr]]\n" "dup v1.8h, %[minval].h[0]\n" @@ -1736,6 +1775,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( +#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ".arch armv8.2-a+fp16\n" +#endif "dup v0.8h, %[maxval].h[0]\n" "ldr q2, [%[biasptr]]\n" "dup v1.8h, %[minval].h[0]\n" @@ -1907,6 +1949,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( +#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ".arch armv8.2-a+fp16\n" +#endif "dup v0.8h, %[maxval].h[0]\n" "ldr q2, [%[biasptr]]\n" "dup v1.8h, %[minval].h[0]\n" diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_8way_32bit.hpp new file mode 100644 index 0000000000..0080c91b18 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_8way_32bit.hpp @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#pragma once + +#ifdef __aarch64__ + +#include "transpose_interleave_common.hpp" + +// Generic unblocked transposed 8x32-bit sized specialisation +template <> +template +inline void TransformImpl<8, 1, true, 4, 4, false>::Transform( + T* out, const T* const in, const int stride, + const int x0, const int xmax, const int k0, const int kmax +) { + // Redirect to a 16 x uint16_t specialisation + TransformImpl<16, 1, true, 2, 2, false>::Transform( + reinterpret_cast(out), + reinterpret_cast(in), + stride*2, x0*2, xmax*2, k0, kmax + ); +} + +// Generic 16x16-bit sized specialisation +template <> +template +inline void TransformImpl<16, 1, true, 2, 2, false>::Transform( + T* out, const T* const in, const int stride, + const int x0, const int xmax, const int k0, const int kmax +) { + // Redirect to a uint16_t specialisation + Transform( + reinterpret_cast(out), + reinterpret_cast(in), + stride, x0, xmax, k0, kmax + ); +} + +// Specialised 16 x uint16_t version +template <> +inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *const out) { + __asm volatile ( + "LDR q0, [%[in0]]\n" + "STR q0, [%[out]]\n" + "LDR q1, [%[in0], #0x10]\n" + "STR q1, [%[out], #0x10]\n" + "ADD %x[in0], %x[in0], #0x20\n" + ASM_PREFETCH("[%[in0], #192]") + : [in0] "+r" (in0) + : [out] "r" (out) + : "v0", "v1", "memory" + ); +} + +template <> +inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *const out) { + __asm volatile ( + "LDR q0, [%[in0]]\n" + "STR q0, [%[out]]\n" + "LDR q1, [%[in0], #0x10]\n" + "STR q1, [%[out], #0x10]\n" + "ADD %x[in0], %x[in0], #0x20\n" + ASM_PREFETCH("[%[in0], #192]") + + "LDR q2, [%[in1]]\n" + "STR q2, [%[out], #0x20]\n" + "LDR q3, [%[in1], #0x10]\n" + "STR q3, [%[out], #0x30]\n" + "ADD %x[in1], %x[in1], #0x20\n" + ASM_PREFETCH("[%[in1], #192]") + : [in0] "+r" (in0), + [in1] "+r" (in1) + : [out] "r" (out) + : "v0", "v1", "v2", "v3", "memory" + ); +} + +template <> +inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *const out) { + __asm __volatile ( + "LDR q0, [%[in0]]\n" + "STR q0, [%[out]]\n" + "LDR q1, [%[in0], #0x10]\n" + "STR q1, [%[out], #0x10]\n" + "ADD %x[in0], %x[in0], #0x20\n" + ASM_PREFETCH("[%[in0], #192]") + + "LDR q2, [%[in1]]\n" + "STR q2, [%[out], #0x20]\n" + "LDR q3, [%[in1], #0x10]\n" + "STR q3, [%[out], #0x30]\n" + "ADD %x[in1], %x[in1], #0x20\n" + ASM_PREFETCH("[%[in1], #192]") + + "LDR q0, [%[in2]]\n" + "STR q0, [%[out], #0x40]\n" + "LDR q1, [%[in2], #0x10]\n" + "STR q1, [%[out], #0x50]\n" + "ADD %x[in2], %x[in2], #0x20\n" + ASM_PREFETCH("[%[in2], #192]") + + "LDR q2, [%[in3]]\n" + "STR q2, [%[out], #0x60]\n" + "LDR q3, [%[in3], #0x10]\n" + "STR q3, [%[out], #0x70]\n" + "ADD %x[in3], %x[in3], #0x20\n" + ASM_PREFETCH("[%[in3], #192]") + : [in0] "+r" (in0), + [in1] "+r" (in1), + [in2] "+r" (in2), + [in3] "+r" (in3) + : [out] "r" (out) + : "v0", "v1", "v2", "v3", "memory" + ); +} + +template <> +template <> +inline void TransformImpl<16, 1, true, 2, 2, false>::Transform( + uint16_t* out, const uint16_t* const in, const int stride, + const int x0, const int xmax, const int k0, const int kmax +) { + TransposeInterleaveCommon<16, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax); +} + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/transforms/list.hpp 
b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp index c0c2ca19d7..be66cd42ff 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/list.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp @@ -31,5 +31,9 @@ #include "a64_transpose_interleave_12way_16bit.hpp" #include "a64_transpose_interleave_12way_half_to_float.hpp" #include "a64_transpose_interleave_24way_16bit.hpp" +#include "a64_transpose_interleave_8way_32bit.hpp" #include "sve_interleave_8way_32bit.hpp" +#include "sve_interleave_8way_block2_16bit.hpp" +#include "sve_interleave_8way_block4_16bit.hpp" #include "sve_interleave_8way_block4_8bit.hpp" +#include "sve_interleave_8way_block8_8bit.hpp" diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block2_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block2_16bit.hpp new file mode 100644 index 0000000000..234433a0f1 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block2_16bit.hpp @@ -0,0 +1,596 @@ +/* + * Copyright (c) 2019 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#ifdef __ARM_FEATURE_SVE + +template<> +template +inline void TransformImpl<8, 2, false, 2, 2, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) +{ + uint16_t *master_outptr = reinterpret_cast(out); + const uint16_t *inptr = reinterpret_cast(in); + + for (int y=y0; y +template +inline void TransformImpl<8, 4, false, 2, 2, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) +{ + uint16_t *master_outptr = reinterpret_cast(out); + const uint16_t *inptr = reinterpret_cast(in); + + for (int y=y0; y +template +inline void TransformImpl<8, 8, false, 1, 1, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) +{ + uint8_t *master_outptr = reinterpret_cast(out); + const uint8_t *inptr = reinterpret_cast(in); + + for (int y=y0; y
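
For reference, the 8-bit MMLA instructions these kernels are built around (SMMLA for the s8s32 paths, UMMLA for u8u32) each multiply a 2x8 tile of 8-bit operands by two 8-deep columns of the other panel and accumulate into a 2x2 tile of 32-bit results. That is why the new kernel classes report k_unroll() of 8 and why the GemmImplementation entries above only select them when Ksize > 8. A minimal scalar model of one signed accumulation step is sketched below; the helper name smmla_ref is illustrative only and is not part of this patch.

#include <cstdint>

// Scalar model of one SMMLA step: a[2][8] holds two rows of the interleaved
// A panel, b[2][8] holds two 8-deep columns of the B panel, and acc[2][2]
// is the int32 accumulator tile that the instruction updates in place.
static void smmla_ref(int32_t acc[2][2], const int8_t a[2][8], const int8_t b[2][8])
{
    for (int i = 0; i < 2; i++) {
        for (int j = 0; j < 2; j++) {
            for (int k = 0; k < 8; k++) {
                acc[i][j] += static_cast<int32_t>(a[i][k]) * static_cast<int32_t>(b[j][k]);
            }
        }
    }
}

The unsigned kernels follow the same pattern with uint8_t operands and uint32_t accumulators. Because each step consumes eight values along K, the generic.cpp bodies first divide K by 8 and then split the result into paired main-loop iterations plus an optional odd tail, matching the visible setup: loops_count = (K / 2) - 1 and tails_count = K % 2.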