From 94672fb2af6535adc6ea7fe8b8498580ad8cf3f4 Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Wed, 22 Jan 2020 18:36:27 +0000
Subject: COMPMID-3003: Integrate assembly kernels utilizing MMLA instruction.

MMLA is a matrix multiply-accumulate instruction introduced in the Armv8.6-A
architecture (an illustrative note on its semantics follows the diff).

Signed-off-by: Georgios Pinitas
Change-Id: I572a54981d48f5a1e0e9e51102cb7ae28ad87806
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/2663
Tested-by: Arm Jenkins
Reviewed-by: Michalis Spyrou
Comments-Addressed: Arm Jenkins
---
 src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp        |   2 +-
 src/core/NEON/kernels/arm_gemm/gemm_int8.cpp        |  20 +
 src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp       |  20 +
 .../kernels/a64_interleaved_s8s32_mmla_12x8.hpp     |  72 +++
 .../a64_interleaved_s8s32_mmla_12x8/generic.cpp     | 393 ++++++++++++++
 .../kernels/a64_interleaved_u8u32_mmla_12x8.hpp     |  72 +++
 .../a64_interleaved_u8u32_mmla_12x8/generic.cpp     | 393 ++++++++++++++
 .../kernels/sve_interleaved_s8s32_mmla_3VLx8.hpp    |  72 +++
 .../sve_interleaved_s8s32_mmla_3VLx8/generic.cpp    | 395 ++++++++++++++
 .../kernels/sve_interleaved_u8u32_mmla_3VLx8.hpp    |  72 +++
 .../sve_interleaved_u8u32_mmla_3VLx8/generic.cpp    | 395 ++++++++++++++
 .../arm_gemm/merges/a64_merge_fp16_24x8.hpp         |  45 ++
 .../a64_transpose_interleave_8way_32bit.hpp         | 147 +++++
 src/core/NEON/kernels/arm_gemm/transforms/list.hpp  |   4 +
 .../sve_interleave_8way_block2_16bit.hpp            | 596 +++++++++++++++++++++
 .../sve_interleave_8way_block4_16bit.hpp            | 596 +++++++++++++++++++++
 .../transforms/sve_interleave_8way_block8_8bit.hpp  | 596 +++++++++++++++++++++
 17 files changed, 3889 insertions(+), 1 deletion(-)
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_8way_32bit.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block2_16bit.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block4_16bit.hpp
 create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block8_8bit.hpp

diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
index 7f171ec15a..35493a609c 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
@@ -114,7 +114,7 @@ static const GemmImplementation<float, float> gemm_fp32_methods[] =
 },
 {
     GemmMethod::GEMM_HYBRID,
-    "hybrid_fp32_mla_16x4",
+    "hybrid_fp32_mla_16x4_normal",
     [](const GemmArgs &args) { return (args._Ksize >= 4) && !args._trA && args._pretransposed_hint; },
     [](const GemmArgs &args) { return ((args._Ksize <= 256) && (args._Nsize <= 256)) || (args._Msize < 16) || (args._nmulti > 1); },
     [](const GemmArgs &args) { return new GemmHybrid<hybrid_fp32_mla_16x4, float, float>(args); }
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
index a3446b9ddc..f7d8f65aea 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
@@ -34,10 +34,12 @@
 #include "kernels/a64_gemm_s8_12x8.hpp"
 #include "kernels/a64_gemm_s8_4x4.hpp"
 #include "kernels/a64_hybrid_s8s32_dot_16x4.hpp"
+#include "kernels/a64_interleaved_s8s32_mmla_12x8.hpp"
 #include "kernels/a64_smallK_hybrid_s8s32_dot_4x6.hpp"
 #include "kernels/a64_smallK_hybrid_s8s32_dot_4x8.hpp"
 #include "kernels/sve_hybrid_s8s32_dot_4VLx4.hpp"
 #include "kernels/sve_interleaved_s8s32_dot_3VLx8.hpp"
+#include "kernels/sve_interleaved_s8s32_mmla_3VLx8.hpp"
 #include "kernels/sve_native_s8s32_dot_4VLx4.hpp"
 #include "kernels/sve_smallK_hybrid_s8s32_dot_1VLx8.hpp"
 
@@ -45,6 +47,15 @@ namespace arm_gemm {
 
 static const GemmImplementation<int8_t, int32_t> gemm_s8_methods[] = {
 #ifdef __ARM_FEATURE_SVE
+#ifdef V8P6
+{
+    GemmMethod::GEMM_INTERLEAVED,
+    "interleaved_s8s32_mmla_3VLx8",
+    [](const GemmArgs &args) { return (args._Ksize>8); },
+    nullptr,
+    [](const GemmArgs &args) { return new GemmInterleaved<interleaved_s8s32_mmla_3VLx8, int8_t, int32_t>(args); }
+},
+#endif
 {
     GemmMethod::GEMM_HYBRID,
     "smallK_hybrid_s8s32_dot_1VLx8",
@@ -74,6 +85,15 @@ static const GemmImplementation<int8_t, int32_t> gemm_s8_methods[] = {
     [](const GemmArgs &args) { return new GemmInterleaved<interleaved_s8s32_dot_3VLx8, int8_t, int32_t>(args); }
 },
 #endif
+#ifdef V8P6
+{
+    GemmMethod::GEMM_INTERLEAVED,
+    "interleaved_s8s32_mmla_12x8",
+    [](const GemmArgs &args) { return (args._Ksize>8); },
+    nullptr,
+    [](const GemmArgs &args) { return new GemmInterleaved<interleaved_s8s32_mmla_12x8, int8_t, int32_t>(args); }
+},
+#endif
 {
     GemmMethod::GEMM_HYBRID,
     "smallK_hybrid_s8s32_dot_4x8",
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
index aead814d7e..430d35e06d 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
@@ -34,10 +34,12 @@
 #include "kernels/a64_gemm_u8_12x8.hpp"
 #include "kernels/a64_gemm_u8_4x4.hpp"
 #include "kernels/a64_hybrid_u8u32_dot_16x4.hpp"
+#include "kernels/a64_interleaved_u8u32_mmla_12x8.hpp"
 #include "kernels/a64_smallK_hybrid_u8u32_dot_4x6.hpp"
 #include "kernels/a64_smallK_hybrid_u8u32_dot_4x8.hpp"
 #include "kernels/sve_hybrid_u8u32_dot_4VLx4.hpp"
 #include "kernels/sve_interleaved_u8u32_dot_3VLx8.hpp"
+#include "kernels/sve_interleaved_u8u32_mmla_3VLx8.hpp"
 #include "kernels/sve_native_u8u32_dot_4VLx4.hpp"
 #include "kernels/sve_smallK_hybrid_u8u32_dot_1VLx8.hpp"
 
@@ -45,6 +47,15 @@ namespace arm_gemm {
 
 static const GemmImplementation<uint8_t, uint32_t> gemm_u8_methods[] = {
 #ifdef __ARM_FEATURE_SVE
+#ifdef V8P6
+{
+    GemmMethod::GEMM_INTERLEAVED,
+    "interleaved_u8u32_mmla_3VLx8",
+    [](const GemmArgs &args) { return (args._Ksize>8); },
+    nullptr,
+    [](const GemmArgs &args) { return new GemmInterleaved<interleaved_u8u32_mmla_3VLx8, uint8_t, uint32_t>(args); }
+},
+#endif
 {
     GemmMethod::GEMM_HYBRID,
     "smallK_hybrid_u8u32_dot_1VLx8",
@@ -74,6 +85,15 @@ static const GemmImplementation<uint8_t, uint32_t> gemm_u8_methods[] = {
     [](const GemmArgs &args) { return new GemmInterleaved<interleaved_u8u32_dot_3VLx8, uint8_t, uint32_t>(args); }
 },
 #endif
+#ifdef V8P6
+{
+    GemmMethod::GEMM_INTERLEAVED,
+    "interleaved_u8u32_mmla_12x8",
+    [](const GemmArgs &args) { return (args._Ksize>8); },
+    nullptr,
+    [](const GemmArgs &args) { return new GemmInterleaved<interleaved_u8u32_mmla_12x8, uint8_t, uint32_t>(args); }
+},
+#endif
 {
     GemmMethod::GEMM_HYBRID,
     "smallK_hybrid_u8u32_dot_4x8",
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8.hpp
new file mode 100644
index 0000000000..f669b870c6
--- /dev/null
+++
b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8.hpp @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2019 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#ifdef __aarch64__ + +#include +#include "../std_transforms_fixed.hpp" + +namespace arm_gemm { + +// Actual kernel implementations +void a64_interleaved_s8s32_mmla_12x8(const int8_t *, const int8_t *, int32_t *, int, int, int); + +class interleaved_s8s32_mmla_12x8 { +public: + typedef int8_t operand_type; + typedef int32_t result_type; + + typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int); + + /* Kernel blocking parameters */ + static unsigned int out_width() + { + return 12; + } + + static unsigned int out_height() + { + return 8; + } + + static unsigned int k_unroll() + { + return 8; + } + + // Use the standard fixed size transforms. + StdTransformsFixed transforms = {}; + + kern_type kernel=a64_interleaved_s8s32_mmla_12x8; + + interleaved_s8s32_mmla_12x8(const CPUInfo *ci) + { + UNUSED(ci); + } +}; + +} // namespace arm_gemm + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8/generic.cpp new file mode 100644 index 0000000000..49dbdb866e --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8/generic.cpp @@ -0,0 +1,393 @@ +/* + * Copyright (c) 2019 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifdef __aarch64__ + +#include +#include "../../asmlib.hpp" + +namespace arm_gemm { + +void a64_interleaved_s8s32_mmla_12x8(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) { + const int8_t *a_ptr = Apanel; + int32_t *c_ptr = Cpanel; + + K /= 8; + const long loops_count = (K / 2) - 1; + const long tails_count = K % 2; + + for (int yb=0; yb +#include "../std_transforms_fixed.hpp" + +namespace arm_gemm { + +// Actual kernel implementations +void a64_interleaved_u8u32_mmla_12x8(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); + +class interleaved_u8u32_mmla_12x8 { +public: + typedef uint8_t operand_type; + typedef uint32_t result_type; + + typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); + + /* Kernel blocking parameters */ + static unsigned int out_width() + { + return 12; + } + + static unsigned int out_height() + { + return 8; + } + + static unsigned int k_unroll() + { + return 8; + } + + // Use the standard fixed size transforms. + StdTransformsFixed transforms = {}; + + kern_type kernel=a64_interleaved_u8u32_mmla_12x8; + + interleaved_u8u32_mmla_12x8(const CPUInfo *ci) + { + UNUSED(ci); + } +}; + +} // namespace arm_gemm + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8/generic.cpp new file mode 100644 index 0000000000..e182a425f4 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8/generic.cpp @@ -0,0 +1,393 @@ +/* + * Copyright (c) 2019 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifdef __aarch64__ + +#include +#include "../../asmlib.hpp" + +namespace arm_gemm { + +void a64_interleaved_u8u32_mmla_12x8(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) { + const uint8_t *a_ptr = Apanel; + uint32_t *c_ptr = Cpanel; + + K /= 8; + const long loops_count = (K / 2) - 1; + const long tails_count = K % 2; + + for (int yb=0; yb +#include "../std_transforms_sve.hpp" + +namespace arm_gemm { + +// Actual kernel implementations +void sve_interleaved_s8s32_mmla_3VLx8(const int8_t *, const int8_t *, int32_t *, int, int, int); + +class interleaved_s8s32_mmla_3VLx8 { +public: + typedef int8_t operand_type; + typedef int32_t result_type; + + typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int); + + /* Kernel blocking parameters */ + static unsigned int out_width() + { + return get_vector_length() * 3; + } + + static unsigned int out_height() + { + return 8; + } + + static unsigned int k_unroll() + { + return 8; + } + + // Use the standard fixed size transforms. + StdTransformsSVE transforms = {}; + + kern_type kernel=sve_interleaved_s8s32_mmla_3VLx8; + + interleaved_s8s32_mmla_3VLx8(const CPUInfo *ci) + { + UNUSED(ci); + } +}; + +} // namespace arm_gemm + +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8/generic.cpp new file mode 100644 index 0000000000..d636c9d2a4 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8/generic.cpp @@ -0,0 +1,395 @@ +/* + * Copyright (c) 2019 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifdef __ARM_FEATURE_SVE + +#include +#include "../../asmlib.hpp" + +namespace arm_gemm { + +void sve_interleaved_s8s32_mmla_3VLx8(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) { + const int8_t *a_ptr = Apanel; + int32_t *c_ptr = Cpanel; + + K /= 8; + const long loops_count = (K / 2) - 1; + const long tails_count = K % 2; + + for (int yb=0; yb +#include "../std_transforms_sve.hpp" + +namespace arm_gemm { + +// Actual kernel implementations +void sve_interleaved_u8u32_mmla_3VLx8(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); + +class interleaved_u8u32_mmla_3VLx8 { +public: + typedef uint8_t operand_type; + typedef uint32_t result_type; + + typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); + + /* Kernel blocking parameters */ + static unsigned int out_width() + { + return get_vector_length() * 3; + } + + static unsigned int out_height() + { + return 8; + } + + static unsigned int k_unroll() + { + return 8; + } + + // Use the standard fixed size transforms. + StdTransformsSVE transforms = {}; + + kern_type kernel=sve_interleaved_u8u32_mmla_3VLx8; + + interleaved_u8u32_mmla_3VLx8(const CPUInfo *ci) + { + UNUSED(ci); + } +}; + +} // namespace arm_gemm + +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8/generic.cpp new file mode 100644 index 0000000000..15cc8fb897 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8/generic.cpp @@ -0,0 +1,395 @@ +/* + * Copyright (c) 2019 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifdef __ARM_FEATURE_SVE + +#include +#include "../../asmlib.hpp" + +namespace arm_gemm { + +void sve_interleaved_u8u32_mmla_3VLx8(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) { + const uint8_t *a_ptr = Apanel; + uint32_t *c_ptr = Cpanel; + + K /= 8; + const long loops_count = (K / 2) - 1; + const long tails_count = K % 2; + + for (int yb=0; yb(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( +#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ".arch armv8.2-a+fp16\n" +#endif "dup v0.8h, %[maxval].h[0]\n" "ldr q2, [%[outptr0]]\n" "dup v1.8h, %[minval].h[0]\n" @@ -214,6 +217,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( +#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ".arch armv8.2-a+fp16\n" +#endif "dup v0.8h, %[maxval].h[0]\n" "ldr q2, [%[outptr0]]\n" "dup v1.8h, %[minval].h[0]\n" @@ -311,6 +317,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( +#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ".arch armv8.2-a+fp16\n" +#endif "dup v0.8h, %[maxval].h[0]\n" "ldr q2, [%[outptr0]]\n" "dup v1.8h, %[minval].h[0]\n" @@ -430,6 +439,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( +#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ".arch armv8.2-a+fp16\n" +#endif "dup v0.8h, %[maxval].h[0]\n" "ldr q2, [%[outptr0]]\n" "dup v1.8h, %[minval].h[0]\n" @@ -572,6 +584,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( +#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ".arch armv8.2-a+fp16\n" +#endif "dup v0.8h, %[maxval].h[0]\n" "ldr q2, [%[outptr0]]\n" "dup v1.8h, %[minval].h[0]\n" @@ -737,6 +752,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( +#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ".arch armv8.2-a+fp16\n" +#endif "dup v0.8h, %[maxval].h[0]\n" "ldr q2, [%[outptr0]]\n" "dup v1.8h, %[minval].h[0]\n" @@ -926,6 +944,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( +#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ".arch armv8.2-a+fp16\n" +#endif "dup v0.8h, %[maxval].h[0]\n" "ldr q2, [%[outptr0]]\n" "dup v1.8h, %[minval].h[0]\n" @@ -1133,6 +1154,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( +#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ".arch armv8.2-a+fp16\n" +#endif "dup v0.8h, %[maxval].h[0]\n" "ldr q2, [%[biasptr]]\n" "dup v1.8h, %[minval].h[0]\n" @@ -1184,6 +1208,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( +#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ".arch armv8.2-a+fp16\n" +#endif "dup v0.8h, %[maxval].h[0]\n" "ldr q2, [%[biasptr]]\n" "dup v1.8h, %[minval].h[0]\n" @@ -1255,6 +1282,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* 
Optimized routine to copy an entire block */ __asm __volatile ( +#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ".arch armv8.2-a+fp16\n" +#endif "dup v0.8h, %[maxval].h[0]\n" "ldr q2, [%[biasptr]]\n" "dup v1.8h, %[minval].h[0]\n" @@ -1346,6 +1376,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( +#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ".arch armv8.2-a+fp16\n" +#endif "dup v0.8h, %[maxval].h[0]\n" "ldr q2, [%[biasptr]]\n" "dup v1.8h, %[minval].h[0]\n" @@ -1456,6 +1489,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( +#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ".arch armv8.2-a+fp16\n" +#endif "dup v0.8h, %[maxval].h[0]\n" "ldr q2, [%[biasptr]]\n" "dup v1.8h, %[minval].h[0]\n" @@ -1586,6 +1622,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( +#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ".arch armv8.2-a+fp16\n" +#endif "dup v0.8h, %[maxval].h[0]\n" "ldr q2, [%[biasptr]]\n" "dup v1.8h, %[minval].h[0]\n" @@ -1736,6 +1775,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( +#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ".arch armv8.2-a+fp16\n" +#endif "dup v0.8h, %[maxval].h[0]\n" "ldr q2, [%[biasptr]]\n" "dup v1.8h, %[minval].h[0]\n" @@ -1907,6 +1949,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( +#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ".arch armv8.2-a+fp16\n" +#endif "dup v0.8h, %[maxval].h[0]\n" "ldr q2, [%[biasptr]]\n" "dup v1.8h, %[minval].h[0]\n" diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_8way_32bit.hpp new file mode 100644 index 0000000000..0080c91b18 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_8way_32bit.hpp @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#pragma once + +#ifdef __aarch64__ + +#include "transpose_interleave_common.hpp" + +// Generic unblocked transposed 8x32-bit sized specialisation +template <> +template +inline void TransformImpl<8, 1, true, 4, 4, false>::Transform( + T* out, const T* const in, const int stride, + const int x0, const int xmax, const int k0, const int kmax +) { + // Redirect to a 16 x uint16_t specialisation + TransformImpl<16, 1, true, 2, 2, false>::Transform( + reinterpret_cast(out), + reinterpret_cast(in), + stride*2, x0*2, xmax*2, k0, kmax + ); +} + +// Generic 16x16-bit sized specialisation +template <> +template +inline void TransformImpl<16, 1, true, 2, 2, false>::Transform( + T* out, const T* const in, const int stride, + const int x0, const int xmax, const int k0, const int kmax +) { + // Redirect to a uint16_t specialisation + Transform( + reinterpret_cast(out), + reinterpret_cast(in), + stride, x0, xmax, k0, kmax + ); +} + +// Specialised 16 x uint16_t version +template <> +inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *const out) { + __asm volatile ( + "LDR q0, [%[in0]]\n" + "STR q0, [%[out]]\n" + "LDR q1, [%[in0], #0x10]\n" + "STR q1, [%[out], #0x10]\n" + "ADD %x[in0], %x[in0], #0x20\n" + ASM_PREFETCH("[%[in0], #192]") + : [in0] "+r" (in0) + : [out] "r" (out) + : "v0", "v1", "memory" + ); +} + +template <> +inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *const out) { + __asm volatile ( + "LDR q0, [%[in0]]\n" + "STR q0, [%[out]]\n" + "LDR q1, [%[in0], #0x10]\n" + "STR q1, [%[out], #0x10]\n" + "ADD %x[in0], %x[in0], #0x20\n" + ASM_PREFETCH("[%[in0], #192]") + + "LDR q2, [%[in1]]\n" + "STR q2, [%[out], #0x20]\n" + "LDR q3, [%[in1], #0x10]\n" + "STR q3, [%[out], #0x30]\n" + "ADD %x[in1], %x[in1], #0x20\n" + ASM_PREFETCH("[%[in1], #192]") + : [in0] "+r" (in0), + [in1] "+r" (in1) + : [out] "r" (out) + : "v0", "v1", "v2", "v3", "memory" + ); +} + +template <> +inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *const out) { + __asm __volatile ( + "LDR q0, [%[in0]]\n" + "STR q0, [%[out]]\n" + "LDR q1, [%[in0], #0x10]\n" + "STR q1, [%[out], #0x10]\n" + "ADD %x[in0], %x[in0], #0x20\n" + ASM_PREFETCH("[%[in0], #192]") + + "LDR q2, [%[in1]]\n" + "STR q2, [%[out], #0x20]\n" + "LDR q3, [%[in1], #0x10]\n" + "STR q3, [%[out], #0x30]\n" + "ADD %x[in1], %x[in1], #0x20\n" + ASM_PREFETCH("[%[in1], #192]") + + "LDR q0, [%[in2]]\n" + "STR q0, [%[out], #0x40]\n" + "LDR q1, [%[in2], #0x10]\n" + "STR q1, [%[out], #0x50]\n" + "ADD %x[in2], %x[in2], #0x20\n" + ASM_PREFETCH("[%[in2], #192]") + + "LDR q2, [%[in3]]\n" + "STR q2, [%[out], #0x60]\n" + "LDR q3, [%[in3], #0x10]\n" + "STR q3, [%[out], #0x70]\n" + "ADD %x[in3], %x[in3], #0x20\n" + ASM_PREFETCH("[%[in3], #192]") + : [in0] "+r" (in0), + [in1] "+r" (in1), + [in2] "+r" (in2), + [in3] "+r" (in3) + : [out] "r" (out) + : "v0", "v1", "v2", "v3", "memory" + ); +} + +template <> +template <> +inline void TransformImpl<16, 1, true, 2, 2, false>::Transform( + uint16_t* out, const uint16_t* const in, const int stride, + const int x0, const int xmax, const int k0, const int kmax +) { + TransposeInterleaveCommon<16, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax); +} + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/transforms/list.hpp 
b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp index c0c2ca19d7..be66cd42ff 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/list.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp @@ -31,5 +31,9 @@ #include "a64_transpose_interleave_12way_16bit.hpp" #include "a64_transpose_interleave_12way_half_to_float.hpp" #include "a64_transpose_interleave_24way_16bit.hpp" +#include "a64_transpose_interleave_8way_32bit.hpp" #include "sve_interleave_8way_32bit.hpp" +#include "sve_interleave_8way_block2_16bit.hpp" +#include "sve_interleave_8way_block4_16bit.hpp" #include "sve_interleave_8way_block4_8bit.hpp" +#include "sve_interleave_8way_block8_8bit.hpp" diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block2_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block2_16bit.hpp new file mode 100644 index 0000000000..234433a0f1 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block2_16bit.hpp @@ -0,0 +1,596 @@ +/* + * Copyright (c) 2019 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#ifdef __ARM_FEATURE_SVE + +template<> +template +inline void TransformImpl<8, 2, false, 2, 2, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) +{ + uint16_t *master_outptr = reinterpret_cast(out); + const uint16_t *inptr = reinterpret_cast(in); + + for (int y=y0; y +template +inline void TransformImpl<8, 4, false, 2, 2, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) +{ + uint16_t *master_outptr = reinterpret_cast(out); + const uint16_t *inptr = reinterpret_cast(in); + + for (int y=y0; y +template +inline void TransformImpl<8, 8, false, 1, 1, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) +{ + uint8_t *master_outptr = reinterpret_cast(out); + const uint8_t *inptr = reinterpret_cast(in); + + for (int y=y0; y
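
Note on the instruction being integrated (illustrative, not part of the patch): SMMLA/UMMLA each take two 128-bit operands, treat the first as a 2x8 matrix of 8-bit values with rows stored contiguously and the second as an 8x2 matrix with columns stored contiguously, and accumulate the resulting 2x2 product into 32-bit accumulators in the destination register; the SVE forms repeat the same operation in each 128-bit segment of a vector. The scalar sketch below models the signed case; the function name smmla_reference is ours, not an arm_gemm symbol.

    #include <array>
    #include <cstdint>

    // Reference model of one SMMLA Vd.4S, Vn.16B, Vm.16B:
    //   Vd (2x2 int32 accumulators) += Vn (2x8 int8, rows contiguous) * Vm (8x2 int8, columns contiguous)
    // Each destination element is an 8-deep dot product, matching k_unroll() == 8 in the new kernel classes.
    static void smmla_reference(std::array<int32_t, 4> &acc,      // Vd: {c00, c01, c10, c11}
                                const std::array<int8_t, 16> &a,  // Vn: row 0 then row 1
                                const std::array<int8_t, 16> &b)  // Vm: column 0 then column 1
    {
        for (int i = 0; i < 2; i++) {
            for (int j = 0; j < 2; j++) {
                int32_t sum = 0;
                for (int k = 0; k < 8; k++) {
                    sum += static_cast<int32_t>(a[i * 8 + k]) * static_cast<int32_t>(b[j * 8 + k]);
                }
                acc[i * 2 + j] += sum;
            }
        }
    }

Under this model the 12x8 (and 3VLx8) output tiles are assembled from a grid of such 2x2 accumulators, and the loop setup visible in the generic.cpp files (K /= 8; loops_count = (K / 2) - 1; tails_count = K % 2) appears to consume two 8-deep K blocks per main-loop iteration plus an optional trailing block.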
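
A second illustrative sketch (hypothetical driver, not the real GemmInterleaved): the new header classes such as interleaved_s8s32_mmla_12x8 follow the arm_gemm strategy convention, exposing operand/result types, the tile geometry via out_width()/out_height()/k_unroll(), and a kern_type function pointer, and the selection-table entries above instantiate GemmInterleaved on them. The run_tile helper and its panel-layout comments below are assumptions used only to show how such a class is consumed.

    // Forward declaration standing in for the arm_gemm CPUInfo type taken by the strategy constructors.
    class CPUInfo;

    // Hypothetical, heavily simplified driver; the real GemmInterleaved adds cache blocking,
    // threading, working-space management and result merging on top of this call.
    template <typename Strategy>
    void run_tile(const typename Strategy::operand_type *a_panel,  // A, interleaved to out_height() rows
                  const typename Strategy::operand_type *b_panel,  // B, transposed/interleaved to out_width() columns
                  typename Strategy::result_type *c_panel,         // output tile accumulators
                  int ablocks, int bblocks, int K, const CPUInfo *ci)
    {
        Strategy strat(ci);
        // K is consumed in multiples of k_unroll() (8 for the MMLA kernels); the selection
        // predicates above additionally require args._Ksize > 8 before these kernels are chosen.
        strat.kernel(a_panel, b_panel, c_panel, ablocks, bblocks, K);
    }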