author     Pablo Tello <pablo.tello@arm.com>          2018-02-23 13:43:50 +0000
committer  Anthony Barbier <anthony.barbier@arm.com>  2018-11-02 16:49:16 +0000
commit     eb82fd2aa786715c3b6a941dc6d6deac4ce8e2a0 (patch)
tree       42cca378eed97c07348f28e1ec708d9c7ed531ce /arm_compute/core/NEON
parent     8df6c452820719d201ee79596cde8445c2071db5 (diff)
COMPMID-881: RSH new arm_gemm interface.
Change-Id: I1e2a1a77097d8017c274af3f97eba6964f80f5fa
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/122592
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Diffstat (limited to 'arm_compute/core/NEON')
62 files changed, 600 insertions, 10927 deletions
diff --git a/arm_compute/core/NEON/NEKernels.h b/arm_compute/core/NEON/NEKernels.h index 5c15e5ecc4..7ec74eaccd 100644 --- a/arm_compute/core/NEON/NEKernels.h +++ b/arm_compute/core/NEON/NEKernels.h @@ -113,13 +113,5 @@ #include "arm_compute/core/NEON/kernels/NEWarpKernel.h" #include "arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h" #include "arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h" -#include "arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h" -#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h" -#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64NativeKernel.h" -#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h" -#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h" -#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h" -#include "arm_compute/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.h" -#include "arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h" #endif /* __ARM_COMPUTE_NEKERNELS_H__ */ diff --git a/arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h b/arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h deleted file mode 100644 index 4868f83d74..0000000000 --- a/arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) 2017-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef __ARM_COMPUTE_NEGEMMAARCH32KERNEL_H__ -#define __ARM_COMPUTE_NEGEMMAARCH32KERNEL_H__ - -#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h" - -namespace arm_compute -{ -class ITensor; - -/** AArch32/armv7a NEON kernel to multiply two input matrices "A" and "B". 
*/ -class NEGEMMAArch32Kernel : public NEGEMMAssemblyBaseKernel -{ -public: - const char *name() const override - { - return "NEGEMMAArch32Kernel"; - } - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - -protected: - void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) override; -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_NEGEMMAARCH32KERNEL_H__*/ diff --git a/arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h b/arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h deleted file mode 100644 index 5252378db7..0000000000 --- a/arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) 2017-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef __ARM_COMPUTE_NEGEMMAARCH64KERNEL_H__ -#define __ARM_COMPUTE_NEGEMMAARCH64KERNEL_H__ - -#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h" - -namespace arm_compute -{ -class ITensor; - -/** AArch64 NEON kernel to multiply two input matrices "A" and "B". */ -class NEGEMMAArch64Kernel : public NEGEMMAssemblyBaseKernel -{ -public: - const char *name() const override - { - return "NEGEMMAArch64Kernel"; - } - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - -protected: - void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) override; -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_NEGEMMAARCH64KERNEL_H__*/ diff --git a/arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64NativeKernel.h b/arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64NativeKernel.h deleted file mode 100644 index ba78aae9f4..0000000000 --- a/arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64NativeKernel.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) 2018 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef __ARM_COMPUTE_NEGEMMAARCH64NATIVEKERNEL_H__ -#define __ARM_COMPUTE_NEGEMMAARCH64NATIVEKERNEL_H__ - -#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h" - -namespace arm_compute -{ -class ITensor; - -/** Native AArch64 NEON kernel to multiply two input matrices "A" and "B". */ -class NEGEMMAArch64NativeKernel : public NEGEMMAssemblyBaseKernel -{ -public: - const char *name() const override - { - return "NEGEMMAArch64NativeKernel"; - } - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - -protected: - void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) override; -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_NEGEMMAARCH64NATIVEKERNEL_H__*/ diff --git a/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h b/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h deleted file mode 100644 index 83c209d48f..0000000000 --- a/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) 2017-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef __ARM_COMPUTE_NEGEMMLOWPAARCH64A53KERNEL_H__ -#define __ARM_COMPUTE_NEGEMMLOWPAARCH64A53KERNEL_H__ - -#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h" - -// Enable only if compiled for AArch64-V8A targets -#ifdef ARM_COMPUTE_AARCH64_V8A - -namespace arm_compute -{ -class ITensor; - -/** AArch64 NEON kernel to multiply two input matrices "A" and "B". */ -class NEGEMMLowpAArch64A53Kernel : public NEGEMMAssemblyBaseKernel -{ -public: - const char *name() const override - { - return "NEGEMMLowpAArch64A53Kernel"; - } - /** Default constructor */ - NEGEMMLowpAArch64A53Kernel(); - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - -protected: - void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) override; - -private: - using NEGEMMLowpAArch64A53 = void(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1, - const Window &window, - const ThreadInfo &info); - NEGEMMLowpAArch64A53 *_func; -}; -} // namespace arm_compute -#endif /* ARM_COMPUTE_AARCH64_V8A */ -#endif /*__ARM_COMPUTE_NEGEMMLOWPAARCH64A53KERNEL_H__*/ diff --git a/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h b/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h deleted file mode 100644 index f813242fc9..0000000000 --- a/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) 2017-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef __ARM_COMPUTE_NEGEMMLOWPAARCH64KERNEL_H__ -#define __ARM_COMPUTE_NEGEMMLOWPAARCH64KERNEL_H__ - -#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h" - -// Enable only if compiled for AArch64-V8A targets -#ifdef ARM_COMPUTE_AARCH64_V8A - -namespace arm_compute -{ -class ITensor; - -/** AArch64 NEON kernel to multiply two input matrices "A" and "B". 
*/ -class NEGEMMLowpAArch64Kernel : public NEGEMMAssemblyBaseKernel -{ -public: - const char *name() const override - { - return "NEGEMMLowpAArch64Kernel"; - } - /** Default constructor */ - NEGEMMLowpAArch64Kernel(); - - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - -protected: - void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) override; - -private: - using NEGEMMLowpAArch64 = void(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, - bool is_transposed_1, const Window &window, - const ThreadInfo &info); - NEGEMMLowpAArch64 *_func; -}; -} // namespace arm_compute -#endif /* ARM_COMPUTE_AARCH64_V8A */ -#endif /*__ARM_COMPUTE_NEGEMMLOWPAARCH64KERNEL_H__*/ diff --git a/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h b/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h deleted file mode 100644 index b854d3a9aa..0000000000 --- a/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (c) 2017-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef __ARM_COMPUTE_NEGEMMLOWPAARCH64V8P4KERNEL_H__ -#define __ARM_COMPUTE_NEGEMMLOWPAARCH64V8P4KERNEL_H__ - -#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h" - -// Enable only if compiled for AArch64-V8.2-A targets -#ifdef ARM_COMPUTE_AARCH64_V8_2 - -namespace arm_compute -{ -class ITensor; - -/** AArch64 NEON kernel to multiply two input matrices "A" and "B". */ -class NEGEMMLowpAArch64V8P4Kernel : public NEGEMMAssemblyBaseKernel -{ -public: - const char *name() const override - { - return "NEGEMMLowpAArch64V8P4Kernel"; - } - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMAssemblyBaseKernel - * - * The computed function is C = a * AxB + b * C. - * - * @param[in] input0 Input tensor info containing the Matrix A. Data types supported: QASYMM8 - * @param[in] input1 Input tensor info containing the Matrix B. Data types supported: same as @p input0 - * @param[in] output Output tensor info to store the result of matrix multiplication. 
- * If @p beta is not zero the values are multiplied by @p beta before the result is accumulated. Otherwise the values are overwritten by the result. Data types supported: S32 - */ - static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output); - -protected: - void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) override; -}; -} // namespace arm_compute -#endif /* ARM_COMPUTE_AARCH64_V8_2 */ -#endif /*__ARM_COMPUTE_NEGEMMLOWPAARCH64V8P4KERNEL_H__*/ diff --git a/arm_compute/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.h b/arm_compute/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.h deleted file mode 100644 index 9fb3ce415a..0000000000 --- a/arm_compute/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) 2017-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef __ARM_COMPUTE_NEGEMVAARCH64KERNEL_H__ -#define __ARM_COMPUTE_NEGEMVAARCH64KERNEL_H__ - -#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h" - -namespace arm_compute -{ -class ITensor; - -/** AArch64 NEON kernel to multiply an input vector "A" and a matrix "B". */ -class NEGEMVAArch64Kernel : public NEGEMMAssemblyBaseKernel -{ -public: - const char *name() const override - { - return "NEGEMVAArch64Kernel"; - } - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - -protected: - void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) override; -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_NEGEMVAARCH64KERNEL_H__*/ diff --git a/arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h b/arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h deleted file mode 100644 index 75c4dbdaa4..0000000000 --- a/arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) 2017-2018 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef __ARM_COMPUTE_NEHGEMMAARCH64FP16KERNEL_H__ -#define __ARM_COMPUTE_NEHGEMMAARCH64FP16KERNEL_H__ - -#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h" - -namespace arm_compute -{ -class ITensor; - -/** AArch64 NEON kernel to multiply two input matrices "A" and "B". */ -class NEHGEMMAArch64FP16Kernel : public NEGEMMAssemblyBaseKernel -{ -public: - const char *name() const override - { - return "NEHGEMMAArch64FP16Kernel"; - } - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - -protected: - void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) override; -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_NEHGEMMAARCH64FP16KERNEL_H__*/ diff --git a/arm_compute/core/NEON/kernels/assembly/NEGEMMAssemblyWrapper.h b/arm_compute/core/NEON/kernels/assembly/NEGEMMAssemblyWrapper.h new file mode 100644 index 0000000000..646cc7861a --- /dev/null +++ b/arm_compute/core/NEON/kernels/assembly/NEGEMMAssemblyWrapper.h @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#ifndef __ARM_COMPUTE_ASSEMBLY_GEMM_KERNEL_WRAPPER_H__
+#define __ARM_COMPUTE_ASSEMBLY_GEMM_KERNEL_WRAPPER_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Utils.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** This class is a wrapper for the assembly kernels.
+ *
+ * Some kernels were written in assembly and highly optimised for specific CPUs like A53 or A55.
+ * This class works as a wrapper for these assembly kernels. The Arm Compute Library creates an instance
+ * of NEGEMMAssemblyWrapper and other auxiliary data structures to execute a single assembly kernel
+ * in the context of an NEFunction.
+ *
+ * The type T is the type of the actual kernel implemented in assembly, which is of type
+ * template<typename To, typename Tr> class GemmCommon.
+ */
+template<typename T>
+class NEGEMMAssemblyWrapper final : public INEKernel
+{
+public:
+    /** Constructor */
+    NEGEMMAssemblyWrapper() : _kernel(nullptr) {}
+
+    NEGEMMAssemblyWrapper(NEGEMMAssemblyWrapper &) = delete;
+    NEGEMMAssemblyWrapper(NEGEMMAssemblyWrapper &&) = default;
+    NEGEMMAssemblyWrapper & operator=(NEGEMMAssemblyWrapper &) = delete;
+
+    const char *name() const override
+    {
+        return "NEGEMMAssemblyWrapper";
+    }
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override
+    {
+        ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void*>(_kernel)));
+        ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+        auto first = window.x().start();
+        auto last  = window.x().end();
+        _kernel->execute(first, last, info.thread_id);
+    }
+    /** Initialise the kernel with the assembly kernel to wrap.
+     *
+     * @param[in] kernel Pointer to an assembly kernel implementation.
+     */
+    void configure(T *kernel)
+    {
+        ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void*>(kernel)));
+        _kernel = kernel;
+        auto win_last = _kernel->get_window_size();
+        Window win;
+        win.set(Window::DimX, Window::Dimension(0, win_last, 1));
+        INEKernel::configure(win);
+    }
+private:
+    T* _kernel;
+};
+
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_ASSEMBLY_GEMM_KERNEL_WRAPPER_H__*/
diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/list.hpp b/arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp
index 8ad5b857fb..d6c9931a21 100644
--- a/arm_compute/core/NEON/kernels/assembly/transforms/list.hpp
+++ b/arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,13 +21,19 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
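The NEGEMMAssemblyWrapper added above is the bridge between the two worlds: configure() sizes a one-dimensional execution window from the kernel's get_window_size() (its number of work units), and run() forwards the scheduler's slice of that window to execute(). A minimal usage sketch, assuming an already-created arm_gemm kernel and the standard NEScheduler; the surrounding setup is illustrative and not part of this patch:

    // Hypothetical glue code: wrap an arm_gemm kernel so the NEON scheduler can run it.
    arm_compute::NEGEMMAssemblyWrapper<arm_gemm::GemmCommon<float, float>> wrapper;
    wrapper.configure(gemm.get()); // 'gemm' is assumed to be a UniqueGemmCommon<float, float>
    arm_compute::NEScheduler::get().schedule(&wrapper, arm_compute::Window::DimX); // work units split across threads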
*/ -#include "a32_interleave_6way_32bit.hpp" -#include "a32_transpose_interleave_8way_32bit.hpp" -#include "a64_block16_interleave4_8bit.hpp" -#include "a64_interleave_8way_16bit.hpp" -#include "a64_interleave_8way_32bit.hpp" -#include "a64_interleave_8way_half_to_float.hpp" -#include "a64_transpose_interleave_12way_16bit.hpp" -#include "a64_transpose_interleave_12way_half_to_float.hpp" -#include "a64_transpose_interleave_24way_16bit.hpp" -#include "transpose_interleave_common.hpp" +#pragma once + +#include <memory> + +#include "arm_gemm_local.hpp" +#include "gemm_common.hpp" + +namespace arm_gemm { + +template<typename Top, typename Tret> +using UniqueGemmCommon = std::unique_ptr<GemmCommon<Top, Tret> >; + +template<typename Top, typename Tret> +UniqueGemmCommon<Top, Tret> gemm(const CPUInfo &ci, const unsigned int M, const unsigned int N, const unsigned int K, const bool trA, const bool trB, const Tret alpha, const Tret beta, const int maxthreads, const bool pretransposed_hint); + +} // namespace arm_gemm diff --git a/arm_compute/core/NEON/kernels/assembly/merges/list.hpp b/arm_compute/core/NEON/kernels/assembly/arm_gemm_local.hpp index 29b915a75d..a608566634 100644 --- a/arm_compute/core/NEON/kernels/assembly/merges/list.hpp +++ b/arm_compute/core/NEON/kernels/assembly/arm_gemm_local.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,9 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "a32_merge_float_8x6.hpp" -#include "a64_merge_float_12x8.hpp" -//#include "a64_merge_float_to_half_12x8.hpp" -//#include "a64_merge_half_24x8.hpp" -//#include "a64_merge_int32_12x8.hpp" +#pragma once + +/* This file is used to configure integration-specific aspects of arm_gemm, this is the gemm-linux version */ + +/* Our CPUInfo is defined in newgemm_lib.hpp */ +#include "newgemm_lib.hpp" diff --git a/arm_compute/core/NEON/kernels/assembly/asmlib.hpp b/arm_compute/core/NEON/kernels/assembly/asmlib.hpp deleted file mode 100644 index fa1d6e37a9..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/asmlib.hpp +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __aarch64__ -// Macro to use in assembler to get a preload. Needed because of various -// workarounds needed to get working preload behaviour. 
-// -// Code using these macros needs to clobber x20 and x21 as they might be -// used by the workaround. - -#define ASM_PREFETCH(address) "PRFM PLDL1KEEP, " address "\n" -#define ASM_PREFETCHL2(address) "PRFM PLDL2KEEP, " address "\n" -#define ASM_PREFETCHW(address) "PRFM PSTL1KEEP, " address "\n" -#define ASM_PREFETCHWL2(address) "PRFM PSTL2KEEP, " address "\n" - -#else - -#define ASM_PREFETCH(address) "PLD " address "\n" -#define ASM_PREFETCHW(address) "PLDW " address "\n" - -#endif - -/* - * Do some prefetches. - */ -template <typename T> -static inline void prefetch_6x(const T *pfp) { - __asm __volatile ( - ASM_PREFETCH("[%[pfp]]") - ASM_PREFETCH("[%[pfp], #64]") - ASM_PREFETCH("[%[pfp], #128]") - ASM_PREFETCH("[%[pfp], #192]") - ASM_PREFETCH("[%[pfp], #256]") - ASM_PREFETCH("[%[pfp], #320]") - : - : [pfp] "r" (pfp) - : "memory" - ); -} - -template <typename T> -static inline void prefetch_5x(const T *pfp) { - __asm __volatile ( - ASM_PREFETCH("[%[pfp]]") - ASM_PREFETCH("[%[pfp], #64]") - ASM_PREFETCH("[%[pfp], #128]") - ASM_PREFETCH("[%[pfp], #192]") - ASM_PREFETCH("[%[pfp], #256]") - : - : [pfp] "r" (pfp) - : "memory" - ); -} - -template <typename T> -static inline void prefetch_4x(const T *pfp) { - __asm __volatile ( - ASM_PREFETCH("[%[pfp]]") - ASM_PREFETCH("[%[pfp], #64]") - ASM_PREFETCH("[%[pfp], #128]") - ASM_PREFETCH("[%[pfp], #192]") - : - : [pfp] "r" (pfp) - : "memory" - ); -} - -template <typename T> -static inline void prefetch_3x(const T *pfp) { - __asm __volatile ( - ASM_PREFETCH("[%[pfp]]") - ASM_PREFETCH("[%[pfp], #64]") - ASM_PREFETCH("[%[pfp], #128]") - : - : [pfp] "r" (pfp) - : "memory" - ); -} - -template <typename T> -static inline void prefetch_2x(const T *pfp) { - __asm __volatile ( - ASM_PREFETCH("[%[pfp]]") - ASM_PREFETCH("[%[pfp], #64]") - : - : [pfp] "r" (pfp) - : "memory" - ); -} - -template <typename T> -static inline void prefetch_1x(const T *pfp) { - __asm __volatile ( - ASM_PREFETCH("[%[pfp]]") - : - : [pfp] "r" (pfp) - : "memory" - ); -} diff --git a/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp b/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp index ef89e3aac3..7f47abcbb9 100644 --- a/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp +++ b/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -23,11 +23,82 @@ */ #pragma once -// Abstract class for a GEMM function +#include <cstddef> + +namespace arm_gemm { + +// Abstract class for the GEMM/GEMV functions. +// +// GEMM implementations may be "native" (never require any input +// permutation), "pretransposed" (require permutation up-front) or require +// working space (permute as they go along). This interface should support +// all of them. + template<typename To, typename Tr> class GemmCommon { +protected: + const To *_Aptr=nullptr; + int _lda=0; + const To *_Bptr=nullptr; + int _ldb=0; + Tr *_Cptr=nullptr; + int _ldc=0; + public: - virtual size_t get_working_size() const = 0; - virtual void execute(const To *, const int, const To *, const int, Tr *, const int, const Tr, const Tr, void *working_space) const = 0; + /* Pass in the pointers to the arrays to be operated on and their + * strides. This has a default implementation that just captures them + * all in protected members. If B is pretransposed (see below) then the + * settings for B here are ignored. 
+     */
+    virtual void set_arrays(const To *A, const int lda, const To *B, const int ldb, Tr *C, const int ldc) {
+        _Aptr = A;
+        _lda  = lda;
+        _Bptr = B;
+        _ldb  = ldb;
+        _Cptr = C;
+        _ldc  = ldc;
+    }
+
+    /* For threading, we divide the work into some number of units and work
+     * out internally what unit corresponds to what work. This returns the
+     * total number of units. */
+    virtual unsigned int get_window_size() const = 0;
+
+    /* The maximum thread count is specified when the GEMM is created. Some
+     * implementations need to know how many threads will actually run in
+     * order to work properly.
+     *
+     * In some cases, after creating the GEMM, the number of threads needs to
+     * be reduced (e.g. not enough work to split across threads). This
+     * method allows the number of threads actually used to be set (it must
+     * be equal or lower).
+     *
+     * This has an empty default implementation, as GEMMs which don't care
+     * about thread count can safely ignore this.
+     */
+    virtual void set_nthreads(int nthreads) { };
+
+    /* Actually do the work. Provide a threadid to index any per-thread
+     * buffers, and a start/end range to indicate which work to do. */
+    virtual void execute(unsigned int start, unsigned int end, int threadid) = 0;
+
+    /*** Working space interface (optional) ***/
+    /* Total number of bytes of temporary working space needed. If zero, it's not necessary to call set_working_space(). */
+    virtual size_t get_working_size() const { return 0; }
+    /* Provide working space buffer - the void * passed in must remain allocated for the duration of any execute calls. */
+    virtual void set_working_space(void *) { };
+
+    /*** "Pretransposed" interface (optional) ***/
+    /* Is this object set up for pretranspose? If so, pretranspose_B_array() needs to be called before execute(). */
+    virtual bool B_is_pretransposed() const { return false; }
+    /* Does pretranspose still need to be done? */
+    virtual bool B_pretranspose_required() const { return false; }
+    /* Total number of bytes of space needed for pretransposed arrays. */
+    virtual size_t get_B_pretransposed_array_size() const { return 0; }
+    /* Perform pretranspose - the void * passed in must remain allocated for the duration of any execute calls. */
+    virtual void pretranspose_B_array(void *buffer, const To *, const int) { };
+
+    // Destructor
+    virtual ~GemmCommon() { }
+};
+
+} // namespace arm_gemm
diff --git a/arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp b/arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp
deleted file mode 100644
index 659ef837f5..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp
+++ /dev/null
@@ -1,177 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
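Read as a whole, the expanded GemmCommon interface above implies a fixed call sequence for drivers: supply the arrays, satisfy the optional working-space and pretranspose interfaces, then execute over the window. A single-threaded sketch of that sequence; the buffer handling is illustrative, not part of this patch:

    // Driving a GemmCommon implementation (single thread, work units 0..get_window_size()).
    gemm->set_arrays(A, lda, B, ldb, C, ldc);
    std::vector<uint8_t> workspace(gemm->get_working_size());
    if (!workspace.empty()) {
        gemm->set_working_space(workspace.data()); // must stay allocated across execute() calls
    }
    std::vector<uint8_t> b_transposed(gemm->get_B_pretransposed_array_size());
    if (gemm->B_is_pretransposed() && gemm->B_pretranspose_required()) {
        gemm->pretranspose_B_array(b_transposed.data(), B, ldb);
    }
    gemm->set_nthreads(1);
    gemm->execute(0, gemm->get_window_size(), 0); // start=0, end=all units, threadid=0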
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#include <stdio.h> -#include <cassert> - -#include "gemm_common.hpp" -#include "profiler.hpp" -#include "transform.hpp" -#include "mergeresults.hpp" - -// Some macros used to decide how much working space to allocate. -// Round allocations up to the next cache line. -#define ALLOC_ROUND 64 -#define ROUND_UP(x) ((((x) + ALLOC_ROUND-1) / ALLOC_ROUND) * ALLOC_ROUND) - -// Implementation of the GemmCommon abstract class. -// -// This implementation interleaves the source matrices in blocks - good for -// larger matrices. -template<typename strategy, typename To, typename Tr> -class GemmInterleaved : public GemmCommon<To, Tr> { - typedef typename strategy::operand_type Toi; - typedef typename strategy::result_type Tri; - - const unsigned int M; - const unsigned int N; - const unsigned int K; - - const bool trA; - const bool trB; - - const strategy strat; - - unsigned int k_block = 0; - unsigned int x_block = 0; - unsigned int Mround = 0; - - size_t get_a_working_size() const { - return ROUND_UP(sizeof(Toi) * k_block * Mround); - } - - size_t get_b_working_size() const { - return ROUND_UP(sizeof(Toi) * x_block * k_block); - } - - size_t get_c_working_size() const { - return ROUND_UP(sizeof(Tri) * x_block * strat.out_height); - } - -public: - size_t get_working_size() const override { - return get_a_working_size() + get_b_working_size() + get_c_working_size(); - } - - GemmInterleaved(const CPUInfo *ci, const unsigned int M, const unsigned int N, const unsigned int K, const bool trA, const bool trB) : M(M), N(N), K(K), trA(trA), trB(trB), strat(ci) { - const unsigned int L1_size = ci->L1_size; - const unsigned int L2_size = ci->L2_size; - - // Work out blocking parameters - // k_block: Each iteration will consume (out_width + out_height) - // operands - so how many iterations will fill the L1? - k_block = L1_size / (sizeof(Toi) * (strat.out_width + strat.out_height)); - - // Needs to be a multiple of the K unroll level. - k_block /= strat.k_unroll; - k_block *= strat.k_unroll; - - // Now tune to presented problem size; this is how many blocks we need. - int num_k_blocks = (K + (k_block - 1)) / k_block; - - // So divide the space equally into that many blocks. - k_block = (K + num_k_blocks - 1) / num_k_blocks; - - // And round UP to the K unroll level required. - k_block = (k_block + strat.k_unroll - 1) / strat.k_unroll; - k_block *= strat.k_unroll; - - // x_block: Work out how many rows (of length k_block) will fit in the L2 - x_block = L2_size / (sizeof(Toi) * k_block); - - // Needs to be a multiple of the kernel output width. - x_block /= strat.out_width; - x_block *= strat.out_width; - - // And tune to the presented problem size. - int num_x_blocks = (N + (x_block - 1)) / x_block; - x_block = (N + num_x_blocks - 1) / num_x_blocks; - - x_block = (x_block + strat.out_width - 1) / strat.out_width; - x_block *= strat.out_width; - - // Work out the rounded size of M - needed for some buffers. 
- Mround = (M + (strat.out_height - 1)) / strat.out_height; - Mround *= strat.out_height; - - } - - // Actually execute the GEMM. - void execute(const To *A, const int lda, const To *B, const int ldb, Tr *C, const int ldc, const Tr alpha, const Tr beta, void *working_space) const override { - assert(working_space); - profiler prof; - int8_t *working_space_bytes = reinterpret_cast<int8_t *>(working_space); - intptr_t working_space_int = reinterpret_cast<intptr_t>(working_space_bytes); - size_t diff = 0; - - if (working_space_int & 0xF) { - diff = 0x10 - (working_space_int & 0xF); - } - - Toi * const a_panel = reinterpret_cast<Toi *>(working_space_bytes + diff); - Toi * const b_panel = reinterpret_cast<Toi *>(working_space_bytes + get_a_working_size() + diff); - Tri * const c_panel = reinterpret_cast<Tri *>(working_space_bytes + get_a_working_size() + get_b_working_size() + diff); - - for (unsigned int k0=0; k0<K; k0 += k_block) { - unsigned int kmax = k0 + k_block; - if (kmax > K) kmax = K; - - // Figure out how many "K" the kernel will actually process. - int kern_k = ((kmax - k0) + (strat.k_unroll - 1)) / strat.k_unroll; - kern_k *= strat.k_unroll; - - prof(PROFILE_PREPA, (M * (kmax-k0) * sizeof(Toi)), [&](void) { - if (trA ^ strategy::A_transpose) { - Transform<strategy::A_interleave, strategy::A_block, true>(a_panel, A, lda, 0, M, k0, kmax); - } else { - Transform<strategy::A_interleave, strategy::A_block, false>(a_panel, A, lda, 0, M, k0, kmax); - } - }); - - for (unsigned int x0=0; x0<N; x0 += x_block) { - unsigned int xmax = x0 + x_block; - if (xmax > N) xmax = N; - - int bblocks = (xmax - x0 + strat.out_width - 1) / strat.out_width; - - prof(PROFILE_PREPB, (xmax-x0) * (kmax-k0) * sizeof(Toi), [&](void) { - if (trB ^ strategy::B_transpose) { - Transform<strategy::B_interleave, strategy::B_block, true>(b_panel, B, ldb, x0, xmax, k0, kmax); - } else { - Transform<strategy::B_interleave, strategy::B_block, false>(b_panel, B, ldb, x0, xmax, k0, kmax); - } - }); - - for (unsigned int y=0; y<M; y+=strat.out_height) { - unsigned int ymax = y + strat.out_height; - if (ymax > M) ymax = M; - - prof(PROFILE_KERNEL, (strat.out_height * bblocks * strat.out_width * kern_k), [&](void) { strat.kernel(a_panel + (y * kern_k), b_panel, c_panel, 1, bblocks, kern_k); }); - prof(PROFILE_MERGE, (strat.out_height * bblocks * strat.out_width * sizeof(Tr)), [&](void) { MergeResults<strategy::out_width, strategy::out_height>(C, c_panel, ldc, y, ymax, x0, xmax, alpha, (k0==0 ? beta : static_cast<Tr>(1))); }); - } - } - } - } -}; diff --git a/arm_compute/core/NEON/kernels/assembly/gemv_transposed.hpp b/arm_compute/core/NEON/kernels/assembly/gemv_transposed.hpp deleted file mode 100644 index 098fdaa7ac..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/gemv_transposed.hpp +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright (c) 2017-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
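To make the removed blocking arithmetic concrete: with an assumed 32 KiB L1, float operands, the 8x6 kernel shape that appears later in this patch (out_width = 8, out_height = 6, k_unroll = 1), and K = 1000, the constructor's logic works out as follows; all numbers are illustrative:

    // k_block: how many K iterations, each consuming (out_width + out_height) operands, fill the L1.
    unsigned int k_block = 32768 / (sizeof(float) * (8 + 6)); // 32768 / 56 = 585
    int num_k_blocks = (1000 + k_block - 1) / k_block;        // ceil(1000 / 585) = 2 passes over K
    k_block = (1000 + num_k_blocks - 1) / num_k_blocks;       // 500: K split evenly, then rounded
                                                              // up to k_unroll (a no-op here)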
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#include <stdio.h> - -#include "gemm_common.hpp" - -#include "profiler.hpp" -#include "transform.hpp" -#include "mergeresults.hpp" - -// Some macros used to decide how much working space to allocate. -// Round allocations up to the next cache line. -#define ALLOC_ROUND 64 -#define ROUND_UP(x) ((((x) + ALLOC_ROUND-1) / ALLOC_ROUND) * ALLOC_ROUND) - -// Implementation of the GemmCommon abstract class. -// -// This is implementation is for GEMV with a transposed matrix. -// -// By default the source data is used in-place, but if type conversion is -// needed we need to allocate working space (CURRENTLY NOT IMPLEMENTED). - -template<typename strategy, typename To, typename Tr> -class GemvTransposed : public GemmCommon<To, Tr> { - typedef typename strategy::operand_type Toi; - typedef typename strategy::result_type Tri; - - const unsigned int N; - const unsigned int K; - - const strategy strat; - - unsigned int m_block; - unsigned int n_block; - - size_t get_a_working_size() const { - return ROUND_UP(sizeof(Toi) * m_block); - } - - size_t get_b_working_size() const { - return ROUND_UP(sizeof(Toi) * m_block * n_block); - } - - size_t get_c_working_size() const { - return ROUND_UP(sizeof(Tri) * n_block); - } - -public: - size_t get_working_size() const override { - return get_a_working_size() + get_b_working_size() + get_c_working_size(); - } - - GemvTransposed(const CPUInfo *ci, const unsigned int N, const unsigned int K) : N(N), K(K), strat(ci) { - /* For now don't do any blocking. TODO: figure out if we should. */ - m_block = K; - n_block = N; - } - - // Actually execute the GEMV. - void execute(const To *A, const int lda, const To *B, const int ldb, Tr *C, const int ldc, const Tr alpha, const Tr beta, void *working_space) const override { - profiler prof; - - static_assert(std::is_same<To, Toi>::value, "gemv_transposed: Operand types must be the same."); - static_assert(std::is_same<Tr, Tri>::value, "gemv_transposed: Result types must be the same."); - - for (unsigned int m0=0; m0<K; m0+=m_block) { - unsigned int mmax = m0 + m_block; - if (mmax > K) mmax = K; - - for (unsigned int n0=0; n0<N; n0+=n_block) { - unsigned int nmax = n0 + n_block; - if (nmax > N) nmax = N; - - prof(PROFILE_KERNEL, ((mmax-m0) * (nmax-n0)), [&](void) { strat.kernel(B + (m0 * ldb) + n0, A + m0, C + n0, alpha, ldb, (mmax-m0), (nmax-n0)); }); - } - } - } -}; diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp deleted file mode 100644 index d78d33c647..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __arm__ - -// Actual kernel implementations -#include "a32_sgemm_8x6/a53.hpp" -#include "a32_sgemm_8x6/a55r1.hpp" -#include "a32_sgemm_8x6/generic.hpp" - -// 8x6 SGEMM "strategy" class. -// -// This describes the characteristics of a family of kernels, in terms of -// the required interleave properties and the output block size. -// -// All kernels in the family must share these characteristics. The actual -// kernel to be used can be chosen at runtime, based on the CPU_type -// structure. -class sgemm_8x6 { -public: - typedef float operand_type; - typedef float result_type; - - typedef void (*kern_type)(const float *, const float *, float *, int, int, int); - - /* Describes the data layout for A input */ - static const int A_interleave = 6; - static const int A_block = 1; - static const int A_transpose = 0; - - /* Same for B input */ - static const int B_interleave = 8; - static const int B_block = 1; - static const int B_transpose = 1; - - /* Kernel blocking parameters */ - static const int out_width = 8; - static const int out_height = 6; - static const int k_unroll = 1; - - kern_type kernel = nullptr; - - sgemm_8x6(const CPUInfo *ci) { - switch(ci->CPU) { - case CPUTarget::A53: - kernel = a32_sgemm_8x6_a53; - break; - - case CPUTarget::A55_DOT: - kernel = a32_sgemm_8x6_a55r1; - break; - - default: - kernel = a32_sgemm_8x6; - break; - } - } -}; - -#endif // __arm__ diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/a53.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/a53.hpp deleted file mode 100644 index 6bfbfc8742..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/a53.hpp +++ /dev/null @@ -1,410 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
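The deleted strategy class above captures the old design this commit replaces: interleave and block-size constants fixed at compile time, with the concrete kernel chosen at construction from the CPU type. A sketch of how such a strategy plugged into the (also deleted) GemmInterleaved template; 'ci' is an assumed CPUInfo pointer and the sizes are illustrative:

    // Old-style instantiation: the strategy parameter supplies layout constants and the kernel.
    GemmInterleaved<sgemm_8x6, float, float> gemm(ci, /* M */ 128, /* N */ 128, /* K */ 128,
                                                  /* trA */ false, /* trB */ false);
    std::vector<uint8_t> ws(gemm.get_working_size());
    gemm.execute(A, lda, B, ldb, C, ldc, /* alpha */ 1.0f, /* beta */ 0.0f, ws.data());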
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __arm__ - -#include <arm_neon.h> - -#include "../../asmlib.hpp" - -// Kernel implementation. -// -// Assume that "Apanel" points to a chunk of A blocks (each size 6xK) in read-order. -// Assume that "Bpanel" points to a chunk of B blocks (each size 8xK) in read-order. -// Assume that "Cpanel" points to a chunk of C output blocks (each size -// 8x6), the chunks being arranged in a row major fashion. -// -// Note that the intent of this is that either ablocks or bblocks will be 1 -// - this construction allows the output loop to proceed in either order. - -inline void a32_sgemm_8x6_a53(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { - const float *a_ptr = Apanel; - float *c_ptr = Cpanel; - - printf("CIAO SONO IO, AMORE MIO!\n"); - - for (int yb=0; yb<ablocks; yb++) { - const float *a_ptr0 = a_ptr; - const float *b_ptr = Bpanel; - - for (int xb=0; xb<bblocks; xb++) { - a_ptr = a_ptr0; - int tails = (K & 3); - if (tails == 0) { - tails = 4; - } - int k = ((K+3)/4) - 1; - - __asm __volatile ( - "vmov.i32 q4, #0\n" - "vld1.32 {d0-d1}, [%[a_ptr] :64]\n" - "vmov.i32 q5, #0\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]\n" - "vmov.i32 q6, #0\n" - "ldr r0, [%[a_ptr], #0x10]\n" - "vmov.i32 q7, #0\n" - "ldr r1, [%[a_ptr], #0x14]\n" - "vmov.i32 q8, #0\n" - ASM_PREFETCH("[%[a_ptr], #0x40]") - "vmov.i32 q9, #0\n" - ASM_PREFETCH("[%[b_ptr], #0x40]") - "vmov.i32 q10, #0\n" - ASM_PREFETCH("[%[a_ptr], #0x80]") - "vmov.i32 q11, #0\n" - ASM_PREFETCH("[%[b_ptr], #0x80]") - "vmov.i32 q12, #0\n" - "vmov.i32 q13, #0\n" - ASM_PREFETCH("[%[a_ptr], #0xC0]") - "vmov.i32 q14, #0\n" - ASM_PREFETCH("[%[b_ptr], #0XC0]") - "vmov.i32 q15, #0\n" - "cmp %[k], #0\n" - "beq 6f\n" - - "1:\n" - // Unroll 0 - "vldr d6, [%[b_ptr], #0x10]\n" - "vmov d2, r0, r1\n" - "vmla.f32 q4, q2, d0[0]\n" - "ldr r0, [%[b_ptr], #0x18]\n" - "vmla.f32 q5, q2, d0[1]\n" - "ldr r1, [%[b_ptr], #0x1C]\n" - "vmla.f32 q6, q2, d1[0]\n" - - "vldr d3, [%[a_ptr], #0x18]\n" - "vmov d7, r0, r1\n" - "vmla.f32 q7, q2, d1[1]\n" - ASM_PREFETCH("[%[a_ptr], #0x100]") - "vmla.f32 q8, q2, d2[0]\n" - "vmla.f32 q9, q2, d2[1]\n" - - "vldr d4, [%[b_ptr], #0x20]\n" - "vmla.f32 q10, q3, d0[0]\n" - "ldr r0, [%[b_ptr], #0x28]\n" - "vmla.f32 q11, q3, d0[1]\n" - "ldr r1, [%[b_ptr], #0x2C]\n" - "vmla.f32 q12, q3, d1[0]\n" - - "vldr d0, [%[a_ptr], #0x20]\n" - "vmov d5, r0, r1\n" - "vmla.f32 q13, q3, d1[1]\n" - "ldr r0, [%[a_ptr], #0x28]\n" - "vmla.f32 q14, q3, d2[0]\n" - "ldr r1, [%[a_ptr], #0x2C]\n" - "vmla.f32 q15, q3, d2[1]\n" - - // Unroll 1 - "vldr d6, [%[b_ptr], #0x30]\n" - "vmov d1, r0, r1\n" - "vmla.f32 q4, q2, d3[0]\n" - "ldr r0, [%[b_ptr], #0x38]\n" - "vmla.f32 q5, q2, d3[1]\n" - "ldr r1, [%[b_ptr], #0x3C]\n" - "vmla.f32 q6, q2, d0[0]\n" - - "vldr d2, [%[a_ptr], #0x30]\n" - "vmov d7, r0, r1\n" - "vmla.f32 q7, q2, d0[1]\n" - ASM_PREFETCH("[%[b_ptr], #0x100]") - "vmla.f32 q8, q2, d1[0]\n" - "vmla.f32 q9, q2, d1[1]\n" - - "vldr d4, [%[b_ptr], #0x40]\n" - "vmla.f32 q10, q3, d3[0]\n" - "ldr r0, [%[b_ptr], #0x48]\n" - "vmla.f32 
q11, q3, d3[1]\n" - "ldr r1, [%[b_ptr], #0x4C]\n" - "vmla.f32 q12, q3, d0[0]\n" - - "vldr d3, [%[a_ptr], #0x38]\n" - "vmov d5, r0, r1\n" - "vmla.f32 q13, q3, d0[1]\n" - "ldr r0, [%[a_ptr], #0x40]\n" - "vmla.f32 q14, q3, d1[0]\n" - "ldr r1, [%[a_ptr], #0x44]\n" - "vmla.f32 q15, q3, d1[1]\n" - - // Unroll 2 - "vldr d6, [%[b_ptr], #0x50]\n" - "vmov d0, r0, r1\n" - "vmla.f32 q4, q2, d2[0]\n" - "ldr r0, [%[b_ptr], #0x58]\n" - "vmla.f32 q5, q2, d2[1]\n" - "ldr r1, [%[b_ptr], #0x5C]\n" - "vmla.f32 q6, q2, d3[0]\n" - - "vldr d1, [%[a_ptr], #0x48]\n" - "vmov d7, r0, r1\n" - "vmla.f32 q7, q2, d3[1]\n" - ASM_PREFETCH("[%[a_ptr], #0x140]") - "vmla.f32 q8, q2, d0[0]\n" - "vmla.f32 q9, q2, d0[1]\n" - - "vldr d4, [%[b_ptr], #0x60]\n" - "vmla.f32 q10, q3, d2[0]\n" - "ldr r0, [%[b_ptr], #0x68]\n" - "vmla.f32 q11, q3, d2[1]\n" - "ldr r1, [%[b_ptr], #0x6C]\n" - "vmla.f32 q12, q3, d3[0]\n" - - "vldr d2, [%[a_ptr], #0x50]\n" - "vmov d5, r0, r1\n" - "vmla.f32 q13, q3, d3[1]\n" - "ldr r0, [%[a_ptr], #0x58]\n" - "vmla.f32 q14, q3, d0[0]\n" - "ldr r1, [%[a_ptr], #0x5C]\n" - "vmla.f32 q15, q3, d0[1]\n" - "add %[a_ptr], %[a_ptr], #0x60\n" - - // Unroll 3 - "vldr d6, [%[b_ptr], #0x70]\n" - "vmov d3, r0, r1\n" - "vmla.f32 q4, q2, d1[0]\n" - "ldr r0, [%[b_ptr], #0x78]\n" - "vmla.f32 q5, q2, d1[1]\n" - "ldr r1, [%[b_ptr], #0x7C]\n" - "vmla.f32 q6, q2, d2[0]\n" - "add %[b_ptr], %[b_ptr], #0x80\n" - - "vldr d0, [%[a_ptr], #0x00]\n" - "vmov d7, r0, r1\n" - "vmla.f32 q7, q2, d2[1]\n" - ASM_PREFETCH("[%[b_ptr], #0xC0]") - "vmla.f32 q8, q2, d3[0]\n" - "vmla.f32 q9, q2, d3[1]\n" - - "vldr d4, [%[b_ptr], #0x00]\n" - "vmla.f32 q10, q3, d1[0]\n" - "ldr r0, [%[b_ptr], #0x08]\n" - "vmla.f32 q11, q3, d1[1]\n" - "ldr r1, [%[b_ptr], #0x0C]\n" - "vmla.f32 q12, q3, d2[0]\n" - "subs %[k], %[k], #1\n" - - "vldr d1, [%[a_ptr], #0x08]\n" - "vmov d5, r0, r1\n" - "vmla.f32 q13, q3, d2[1]\n" - "ldr r0, [%[a_ptr], #0x10]\n" - "vmla.f32 q14, q3, d3[0]\n" - "ldr r1, [%[a_ptr], #0x14]\n" - "vmla.f32 q15, q3, d3[1]\n" - "bne 1b\n" - - // "Tails" shows how many multiply blocks are needed at the - // end, must be 1-4 inclusive. Bail out to alternative tail - // immediately if it's 1. - "6:\n" - "subs %[tails], %[tails], #1\n" - "beq 3f\n" - - // Detached final iteration - for now adapt the generic - // tails rather than reimplementing for A53. 
- - // Unroll 0 - "vmov d2, r0, r1\n" - "add %[a_ptr], %[a_ptr], #0x18\n" - "vmla.f32 q4, q2, d0[0]\n" - "vld1.32 {d3}, [%[a_ptr] :64]!\n" - "vmla.f32 q5, q2, d0[1]\n" - "add %[b_ptr], %[b_ptr], #0x10\n" - "vmla.f32 q6, q2, d1[0]\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n" - "vmla.f32 q7, q2, d1[1]\n" - "vmla.f32 q8, q2, d2[0]\n" - "subs %[tails], %[tails], #1\n" - "vmla.f32 q9, q2, d2[1]\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n" - - "vmla.f32 q10, q3, d0[0]\n" - "vmla.f32 q11, q3, d0[1]\n" - "vmla.f32 q12, q3, d1[0]\n" - "vmla.f32 q13, q3, d1[1]\n" - "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n" - "vmla.f32 q14, q3, d2[0]\n" - "vmla.f32 q15, q3, d2[1]\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n" - "beq 4f\n" - - // Unroll 1 - "vmla.f32 q4, q2, d3[0]\n" - "vmla.f32 q5, q2, d3[1]\n" - "subs %[tails], %[tails], #1\n" - "vmla.f32 q6, q2, d0[0]\n" - "vmla.f32 q7, q2, d0[1]\n" - "vmla.f32 q8, q2, d1[0]\n" - "vmla.f32 q9, q2, d1[1]\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n" - - "vmla.f32 q10, q3, d3[0]\n" - "vmla.f32 q11, q3, d3[1]\n" - "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n" - "vmla.f32 q12, q3, d0[0]\n" - "vmla.f32 q13, q3, d0[1]\n" - "vmla.f32 q14, q3, d1[0]\n" - "vmla.f32 q15, q3, d1[1]\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n" - "beq 5f\n" - - // Unroll 2 - "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n" - "vmla.f32 q4, q2, d2[0]\n" - "vmla.f32 q5, q2, d2[1]\n" - "vmla.f32 q6, q2, d3[0]\n" - "vmla.f32 q7, q2, d3[1]\n" - "vmla.f32 q8, q2, d0[0]\n" - "vmla.f32 q9, q2, d0[1]\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n" - - "vmla.f32 q10, q3, d2[0]\n" - "vmla.f32 q11, q3, d2[1]\n" - "vmla.f32 q12, q3, d3[0]\n" - "vmla.f32 q13, q3, d3[1]\n" - "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n" - "vmla.f32 q14, q3, d0[0]\n" - "vmla.f32 q15, q3, d0[1]\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n" - - // Unroll 3 - "vmla.f32 q4, q2, d1[0]\n" - "vmla.f32 q10, q3, d1[0]\n" - "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n" - "vmla.f32 q5, q2, d1[1]\n" - "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n" - "vmla.f32 q11, q3, d1[1]\n" - "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n" - "vmla.f32 q6, q2, d2[0]\n" - "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n" - "vmla.f32 q12, q3, d2[0]\n" - "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n" - "vmla.f32 q7, q2, d2[1]\n" - "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n" - "vmla.f32 q13, q3, d2[1]\n" - "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n" - "vmla.f32 q8, q2, d3[0]\n" - "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n" - "vmla.f32 q14, q3, d3[0]\n" - "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n" - "vmla.f32 q9, q2, d3[1]\n" - "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n" - "vmla.f32 q15, q3, d3[1]\n" - "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n" - "b 2f\n" - - // tails==1 final tail - "3:\n" - "vmov d2, r0, r1\n" - "add %[b_ptr], %[b_ptr], #0x10\n" - "vmla.f32 q4, q2, d0[0]\n" - "add %[a_ptr], %[a_ptr], #0x18\n" - "vmla.f32 q5, q2, d0[1]\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n" - "vmla.f32 q6, q2, d1[0]\n" - "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n" - "vmla.f32 q10, q3, d0[0]\n" - "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n" - "vmla.f32 q11, q3, d0[1]\n" - "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n" - "vmla.f32 q12, q3, d1[0]\n" - "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n" - "vmla.f32 q7, q2, d1[1]\n" - "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n" - "vmla.f32 q13, q3, d1[1]\n" - "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n" - "vmla.f32 q8, q2, d2[0]\n" - "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n" - "vmla.f32 q14, q3, d2[0]\n" - "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n" - "vmla.f32 q9, q2, d2[1]\n" - "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n" - "vmla.f32 q15, q3, d2[1]\n" - "vst1.32 {d28-d29}, [%[c_ptr] 
:128]!\n" - "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n" - "b 2f\n" - - // tails==2 final tail - "4:\n" - "vmla.f32 q4, q2, d3[0]\n" - "vmla.f32 q10, q3, d3[0]\n" - "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n" - "vmla.f32 q5, q2, d3[1]\n" - "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n" - "vmla.f32 q11, q3, d3[1]\n" - "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n" - "vmla.f32 q6, q2, d0[0]\n" - "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n" - "vmla.f32 q12, q3, d0[0]\n" - "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n" - "vmla.f32 q7, q2, d0[1]\n" - "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n" - "vmla.f32 q13, q3, d0[1]\n" - "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n" - "vmla.f32 q8, q2, d1[0]\n" - "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n" - "vmla.f32 q14, q3, d1[0]\n" - "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n" - "vmla.f32 q9, q2, d1[1]\n" - "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n" - "vmla.f32 q15, q3, d1[1]\n" - "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n" - "b 2f\n" - - // tails==3 final tail - "5:\n" - "vmla.f32 q4, q2, d2[0]\n" - "vld1.32 {d0}, [%[a_ptr] :64]!\n" - "vmla.f32 q5, q2, d2[1]\n" - "vmla.f32 q6, q2, d3[0]\n" - "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n" - "vmla.f32 q10, q3, d2[0]\n" - "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n" - "vmla.f32 q11, q3, d2[1]\n" - "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n" - "vmla.f32 q12, q3, d3[0]\n" - "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n" - "vmla.f32 q7, q2, d3[1]\n" - "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n" - "vmla.f32 q13, q3, d3[1]\n" - "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n" - "vmla.f32 q8, q2, d0[0]\n" - "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n" - "vmla.f32 q14, q3, d0[0]\n" - "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n" - "vmla.f32 q9, q2, d0[1]\n" - "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n" - "vmla.f32 q15, q3, d0[1]\n" - "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n" - "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n" - - "2:\n" - "vst1.32 {d30-d31}, [%[c_ptr] :128]!\n" - : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), [k] "+r" (k), [tails] "+r" (tails) - : - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r0", "r1" - ); - } - } -} - -#endif diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/a55r1.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/a55r1.hpp deleted file mode 100644 index 4f0ef7cd21..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/a55r1.hpp +++ /dev/null @@ -1,413 +0,0 @@ -/* - * Copyright (c) 2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __arm__ - -#include <arm_neon.h> - -#include "../../asmlib.hpp" - -// Kernel implementation. -// -// Assume that "Apanel" points to a chunk of A blocks (each size 6xK) in read-order. -// Assume that "Bpanel" points to a chunk of B blocks (each size 8xK) in read-order. -// Assume that "Cpanel" points to a chunk of C output blocks (each size -// 8x6), the chunks being arranged in a row major fashion. -// -// Note that the intent of this is that either ablocks or bblocks will be 1 -// - this construction allows the output loop to proceed in either order. - -inline void a32_sgemm_8x6_a55r1(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { - const float *a_ptr = Apanel; - float *c_ptr = Cpanel; - - /* Work out starting values for "k" and "tails" in the inner loop. */ - int tails_initial = (K & 3); - if (tails_initial == 0) { - tails_initial = 4; - } - - int k_initial = ((K+3)/4) - 1; - - for (int yb=0; yb<ablocks; yb++) { - const float *a_ptr0 = a_ptr; - const float *b_ptr = Bpanel; - - for (int xb=0; xb<bblocks; xb++) { - int tails = tails_initial; - int k = k_initial; - - a_ptr = a_ptr0; - - __asm __volatile ( - "vldr d0, [%[a_ptr]]\n" - "vmov.i32 q4, #0\n" - "vldr d1, [%[a_ptr], #0x08]\n" - "vmov.i32 q5, #0\n" - "vldr d4, [%[b_ptr]]\n" - "vmov.i32 q6, #0\n" - "vldr d5, [%[b_ptr], #0x08]\n" - "vmov.i32 q7, #0\n" - "vldr d2, [%[a_ptr], #0x10]\n" - "vmov.i32 q8, #0\n" - ASM_PREFETCH("[%[b_ptr], #0x40]") - "vmov.i32 q9, #0\n" - ASM_PREFETCH("[%[a_ptr], #0x40]") - "vmov.i32 q10, #0\n" - ASM_PREFETCH("[%[b_ptr], #0x80]") - "vmov.i32 q11, #0\n" - ASM_PREFETCH("[%[a_ptr], #0x80]") - "vmov.i32 q12, #0\n" - ASM_PREFETCH("[%[b_ptr], #0XC0]") - "vmov.i32 q13, #0\n" - ASM_PREFETCH("[%[a_ptr], #0xC0]") - "vmov.i32 q14, #0\n" - ASM_PREFETCH("[%[b_ptr], #0x100]") - "vmov.i32 q15, #0\n" - ASM_PREFETCH("[%[a_ptr], #0x100]") - "cmp %[k], #0\n" - ASM_PREFETCH("[%[b_ptr], #0x140]") - "beq 6f\n" - ASM_PREFETCH("[%[b_ptr], #0x180]") - - "1:\n" - // Unroll 0 - "vmla.f32 q4, q2, d0[0]\n" - "vldr d6, [%[b_ptr], #0x10]\n" - "vmla.f32 q5, q2, d0[1]\n" - "vldr d7, [%[b_ptr], #0x18]\n" - "vmla.f32 q6, q2, d1[0]\n" - "vldr d3, [%[a_ptr], #0x18]\n" - "vmla.f32 q7, q2, d1[1]\n" - ASM_PREFETCH("[%[a_ptr], #0x140]") - "vmla.f32 q8, q2, d2[0]\n" - "subs %[k], %[k], #1\n" - "vmla.f32 q9, q2, d2[1]\n" - "vldr d4, [%[b_ptr], #0x20]\n" - "vmla.f32 q10, q3, d0[0]\n" - "vldr d5, [%[b_ptr], #0x28]\n" - "vmla.f32 q11, q3, d0[1]\n" - "vldr d0, [%[a_ptr], #0x20]\n" - "vmla.f32 q12, q3, d1[0]\n" - - "vmla.f32 q13, q3, d1[1]\n" - "vldr d1, [%[a_ptr], #0x28]\n" - "vmla.f32 q14, q3, d2[0]\n" - - "vmla.f32 q15, q3, d2[1]\n" - "vldr d6, [%[b_ptr], #0x30]\n" - - // Unroll 1 - "vmla.f32 q4, q2, d3[0]\n" - "vldr d7, [%[b_ptr], #0x38]\n" - "vmla.f32 q5, q2, d3[1]\n" - "vldr d2, [%[a_ptr], #0x30]\n" - "vmla.f32 q6, q2, d0[0]\n" - - "vmla.f32 q7, q2, d0[1]\n" - ASM_PREFETCH("[%[b_ptr], #0x1C0]") - "vmla.f32 q8, q2, d1[0]\n" - - "vmla.f32 q9, q2, d1[1]\n" - "vldr d4, [%[b_ptr], #0x40]\n" - "vmla.f32 q10, q3, d3[0]\n" - "vldr d5, [%[b_ptr], #0x48]\n" - "vmla.f32 q11, q3, d3[1]\n" - "vldr d3, [%[a_ptr], #0x38]\n" - "vmla.f32 q12, q3, d0[0]\n" - - "vmla.f32 q13, q3, d0[1]\n" - "vldr d0, [%[a_ptr], #0x40]\n" - 
"vmla.f32 q14, q3, d1[0]\n" - - "vmla.f32 q15, q3, d1[1]\n" - "vldr d6, [%[b_ptr], #0x50]\n" - - // Unroll 2 - "vmla.f32 q4, q2, d2[0]\n" - "vldr d7, [%[b_ptr], #0x58]\n" - "vmla.f32 q5, q2, d2[1]\n" - "vldr d1, [%[a_ptr], #0x48]\n" - "vmla.f32 q6, q2, d3[0]\n" - - "vmla.f32 q7, q2, d3[1]\n" - ASM_PREFETCH("[%[a_ptr], #0x180]") - "vmla.f32 q8, q2, d0[0]\n" - - "vmla.f32 q9, q2, d0[1]\n" - "vldr d4, [%[b_ptr], #0x60]\n" - "vmla.f32 q10, q3, d2[0]\n" - "vldr d5, [%[b_ptr], #0x68]\n" - "vmla.f32 q11, q3, d2[1]\n" - "vldr d2, [%[a_ptr], #0x50]\n" - "vmla.f32 q12, q3, d3[0]\n" - - "vmla.f32 q13, q3, d3[1]\n" - "vldr d3, [%[a_ptr], #0x58]\n" - "vmla.f32 q14, q3, d0[0]\n" - "add %[a_ptr], %[a_ptr], #0x60\n" - "vmla.f32 q15, q3, d0[1]\n" - "vldr d6, [%[b_ptr], #0x70]\n" - - // Unroll 3 - "vmla.f32 q4, q2, d1[0]\n" - "vldr d7, [%[b_ptr], #0x78]\n" - "vmla.f32 q5, q2, d1[1]\n" - "add %[b_ptr], %[b_ptr], #0x80\n" - "vmla.f32 q6, q2, d2[0]\n" - "vldr d0, [%[a_ptr], #0x00]\n" - "vmla.f32 q7, q2, d2[1]\n" - ASM_PREFETCH("[%[b_ptr], #0x180]") - "vmla.f32 q8, q2, d3[0]\n" - - "vmla.f32 q9, q2, d3[1]\n" - "vldr d4, [%[b_ptr], #0x00]\n" - "vmla.f32 q10, q3, d1[0]\n" - "vldr d5, [%[b_ptr], #0x08]\n" - "vmla.f32 q11, q3, d1[1]\n" - "vldr d1, [%[a_ptr], #0x08]\n" - "vmla.f32 q12, q3, d2[0]\n" - - "vmla.f32 q13, q3, d2[1]\n" - "vldr d2, [%[a_ptr], #0x10]\n" - "vmla.f32 q14, q3, d3[0]\n" - - "vmla.f32 q15, q3, d3[1]\n" - "bne 1b\n" - - // "Tails" shows how many multiply blocks are needed at the - // end, must be 1-4 inclusive. Bail out to alternative tail - // immediately if it's 1. - "6:\n" - "subs %[tails], %[tails], #1\n" - "beq 3f\n" - - // Detached final iteration - - // Unroll 0 - "vmla.f32 q4, q2, d0[0]\n" - "vldr d6, [%[b_ptr], #0x10]\n" - "vmla.f32 q5, q2, d0[1]\n" - "vldr d7, [%[b_ptr], #0x18]\n" - "vmla.f32 q6, q2, d1[0]\n" - "vldr d3, [%[a_ptr], #0x18]\n" - "vmla.f32 q7, q2, d1[1]\n" - "subs %[tails], %[tails], #1\n" - "vmla.f32 q8, q2, d2[0]\n" - "vmla.f32 q9, q2, d2[1]\n" - "vldr d4, [%[b_ptr], #0x20]\n" - - "vmla.f32 q10, q3, d0[0]\n" - "vldr d5, [%[b_ptr], #0x28]\n" - "vmla.f32 q11, q3, d0[1]\n" - "vldr d0, [%[a_ptr], #0x20]\n" - "vmla.f32 q12, q3, d1[0]\n" - "add %[b_ptr], %[b_ptr], #0x30\n" - "vmla.f32 q13, q3, d1[1]\n" - "vldr d1, [%[a_ptr], #0x28]\n" - "vmla.f32 q14, q3, d2[0]\n" - "vmla.f32 q15, q3, d2[1]\n" - "beq 4f\n" - - // Unroll 1 - "vmla.f32 q4, q2, d3[0]\n" - "vldr d6, [%[b_ptr], #0x30]\n" - "vmla.f32 q5, q2, d3[1]\n" - "vldr d7, [%[b_ptr], #0x38]\n" - "vmla.f32 q6, q2, d0[0]\n" - "vldr d2, [%[a_ptr], #0x30]\n" - "vmla.f32 q7, q2, d0[1]\n" - "subs %[tails], %[tails], #1\n" - "vmla.f32 q8, q2, d1[0]\n" - - "vmla.f32 q9, q2, d1[1]\n" - - "vmla.f32 q10, q3, d3[0]\n" - "vldr d4, [%[b_ptr], #0x40]\n" - "vmla.f32 q11, q3, d3[1]\n" - "vldr d5, [%[b_ptr], #0x48]\n" - "vmla.f32 q12, q3, d0[0]\n" - "vldr d3, [%[a_ptr], #0x38]\n" - "vmla.f32 q13, q3, d0[1]\n" - "vldr d0, [%[a_ptr], #0x40]\n" - "vmla.f32 q14, q3, d1[0]\n" - "vmla.f32 q15, q3, d1[1]\n" - "beq 5f\n" - - // Unroll 2 - "vmla.f32 q4, q2, d2[0]\n" - "vldr d6, [%[b_ptr], #0x50]\n" - "vmla.f32 q5, q2, d2[1]\n" - "vldr d7, [%[b_ptr], #0x58]\n" - "vmla.f32 q6, q2, d3[0]\n" - "vldr d1, [%[a_ptr], #0x48]\n" - "vmla.f32 q7, q2, d3[1]\n" - "vmla.f32 q8, q2, d0[0]\n" - "vmla.f32 q9, q2, d0[1]\n" - - "vmla.f32 q10, q3, d2[0]\n" - "vldr d4, [%[b_ptr], #0x60]\n" - "vmla.f32 q11, q3, d2[1]\n" - "vldr d5, [%[b_ptr], #0x68]\n" - "vmla.f32 q12, q3, d3[0]\n" - "vldr d2, [%[a_ptr], #0x50]\n" - "vmla.f32 q13, q3, d3[1]\n" - "vldr d3, [%[a_ptr], 
#0x58]\n" - "vmla.f32 q14, q3, d0[0]\n" - "vmla.f32 q15, q3, d0[1]\n" - - // Unroll 3 - "vmla.f32 q4, q2, d1[0]\n" - "vldr d6, [%[b_ptr], #0x70]\n" - "vmla.f32 q5, q2, d1[1]\n" - "vldr d7, [%[b_ptr], #0x78]\n" - "vmla.f32 q10, q3, d1[0]\n" - "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n" - "vmla.f32 q11, q3, d1[1]\n" - "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n" - "vmla.f32 q6, q2, d2[0]\n" - "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n" - "vmla.f32 q12, q3, d2[0]\n" - "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n" - "vmla.f32 q7, q2, d2[1]\n" - "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n" - "vmla.f32 q13, q3, d2[1]\n" - "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n" - "vmla.f32 q8, q2, d3[0]\n" - "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n" - "vmla.f32 q14, q3, d3[0]\n" - "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n" - "vmla.f32 q9, q2, d3[1]\n" - "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n" - "vmla.f32 q15, q3, d3[1]\n" - "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n" - "add %[a_ptr], %[a_ptr], #0x60\n" - "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n" - "add %[b_ptr], %[b_ptr], #0x80\n" - "b 2f\n" - - // tails==1 final tail - "3:\n" - "vmla.f32 q4, q2, d0[0]\n" - "vldr d6, [%[b_ptr], #0x10]\n" - "vmla.f32 q5, q2, d0[1]\n" - "vldr d7, [%[b_ptr], #0x18]\n" - "vmla.f32 q6, q2, d1[0]\n" - "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n" - "vmla.f32 q10, q3, d0[0]\n" - "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n" - "vmla.f32 q11, q3, d0[1]\n" - "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n" - "vmla.f32 q12, q3, d1[0]\n" - "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n" - "vmla.f32 q7, q2, d1[1]\n" - "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n" - "vmla.f32 q13, q3, d1[1]\n" - "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n" - "vmla.f32 q8, q2, d2[0]\n" - "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n" - "vmla.f32 q14, q3, d2[0]\n" - "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n" - "vmla.f32 q9, q2, d2[1]\n" - "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n" - "vmla.f32 q15, q3, d2[1]\n" - "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n" - "add %[a_ptr], %[a_ptr], #0x18\n" - "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n" - "add %[b_ptr], %[b_ptr], #0x20\n" - "b 2f\n" - - // tails==2 final tail - "4:\n" - "vmla.f32 q4, q2, d3[0]\n" - "vldr d6, [%[b_ptr], #0x30]\n" - "vmla.f32 q5, q2, d3[1]\n" - "vldr d7, [%[b_ptr], #0x38]\n" - "vmla.f32 q10, q3, d3[0]\n" - "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n" - "vmla.f32 q11, q3, d3[1]\n" - "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n" - "vmla.f32 q6, q2, d0[0]\n" - "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n" - "vmla.f32 q12, q3, d0[0]\n" - "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n" - "vmla.f32 q7, q2, d0[1]\n" - "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n" - "vmla.f32 q13, q3, d0[1]\n" - "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n" - "vmla.f32 q8, q2, d1[0]\n" - "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n" - "vmla.f32 q14, q3, d1[0]\n" - "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n" - "vmla.f32 q9, q2, d1[1]\n" - "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n" - "vmla.f32 q15, q3, d1[1]\n" - "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n" - "add %[b_ptr], %[b_ptr], #0x40\n" - "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n" - "add %[a_ptr], %[a_ptr], #0x30\n" - "b 2f\n" - - // tails==3 final tail - "5:\n" - "vmla.f32 q4, q2, d2[0]\n" - "vldr d6, [%[b_ptr], #0x50]\n" - "vmla.f32 q5, q2, d2[1]\n" - "vldr d7, [%[b_ptr], #0x58]\n" - "vmla.f32 q6, q2, d3[0]\n" - "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n" - "vmla.f32 q10, q3, d2[0]\n" - "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n" - "vmla.f32 q11, q3, d2[1]\n" - "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n" - "vmla.f32 q12, q3, d3[0]\n" - "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n" - "vmla.f32 q7, q2, d3[1]\n" - "vst1.32 {d12-d13}, [%[c_ptr] 
:128]!\n" - "vmla.f32 q13, q3, d3[1]\n" - "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n" - "vmla.f32 q8, q2, d0[0]\n" - "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n" - "vmla.f32 q14, q3, d0[0]\n" - "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n" - "vmla.f32 q9, q2, d0[1]\n" - "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n" - "vmla.f32 q15, q3, d0[1]\n" - "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n" - "add %[a_ptr], %[a_ptr], #0x48\n" - "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n" - "add %[b_ptr], %[b_ptr], #0x60\n" - - "2:\n" - "vst1.32 {d30-d31}, [%[c_ptr] :128]!\n" - : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), [k] "+r" (k), [tails] "+r" (tails) - : - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r0", "r1" - ); - } - } -} - -#endif /* __arm__ */ diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/generic.hpp deleted file mode 100644 index 7a44fed5b2..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/generic.hpp +++ /dev/null @@ -1,350 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#include "../../asmlib.hpp" - -#include <arm_neon.h> - -// Kernel implementation. -// -// Assume that "Apanel" points to a chunk of A blocks (each size 6xK) in read-order. -// Assume that "Bpanel" points to a chunk of B blocks (each size 8xK) in read-order. -// Assume that "Cpanel" points to a chunk of C output blocks (each size -// 8x6), the chunks being arranged in a row major fashion. -// -// Note that the intent of this is that either ablocks or bblocks will be 1 -// - this construction allows the output loop to proceed in either order. 
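Because the comment above pins down the panel layout completely, the kernel's arithmetic can be summarised by a scalar reference in plain C++. The sketch below (function name ours) models what the assembly computes under those layout assumptions; it is useful for testing, not the shipped implementation:

    // Scalar model of a32_sgemm_8x6: A blocks supply 6 values per K-step,
    // B blocks supply 8, and each output block is 6 rows of 8 floats,
    // stored row major (the q4/q10, q5/q11, ... store order above).
    static void a32_sgemm_8x6_ref(const float *Apanel, const float *Bpanel, float *Cpanel,
                                  int ablocks, int bblocks, int K) {
        const float *a_block = Apanel;
        float *c = Cpanel;
        for (int yb = 0; yb < ablocks; yb++) {
            const float *b = Bpanel;
            for (int xb = 0; xb < bblocks; xb++) {
                float acc[6][8] = {};             // accumulators, as q4-q15 above
                const float *a = a_block;
                for (int k = 0; k < K; k++) {     // one rank-1 update per K-step
                    for (int r = 0; r < 6; r++) {
                        for (int col = 0; col < 8; col++) {
                            acc[r][col] += a[r] * b[col];
                        }
                    }
                    a += 6;
                    b += 8;
                }
                for (int r = 0; r < 6; r++) {
                    for (int col = 0; col < 8; col++) {
                        *c++ = acc[r][col];
                    }
                }
            }
            a_block += 6 * K;
        }
    }

As the comment notes, either ablocks or bblocks is expected to be 1, so the reference walks the panels in the same order as the assembly does.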
- -inline void a32_sgemm_8x6(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { - const float *a_ptr = Apanel; - float *c_ptr = Cpanel; - - for (int yb=0; yb<ablocks; yb++) { - const float *a_ptr0 = a_ptr; - const float *b_ptr = Bpanel; - - for (int xb=0; xb<bblocks; xb++) { - a_ptr = a_ptr0; - int tails = (K & 3); - if (tails == 0) { - tails = 4; - } - int k = ((K+3)/4) - 1; - - __asm __volatile ( - "vmov.i32 q4, #0\n" - "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n" - "vmov.i32 q5, #0\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n" - "vmov.i32 q6, #0\n" - ASM_PREFETCH("[%[a_ptr], #48]") - "vmov.i32 q7, #0\n" - ASM_PREFETCH("[%[b_ptr], #48]") - "vmov.i32 q8, #0\n" - ASM_PREFETCH("[%[a_ptr], #112]") - "vmov.i32 q9, #0\n" - ASM_PREFETCH("[%[b_ptr], #112]") - "vmov.i32 q10, #0\n" - "vmov.i32 q11, #0\n" - "vmov.i32 q12, #0\n" - "vmov.i32 q13, #0\n" - ASM_PREFETCH("[%[a_ptr], #176]") - "vmov.i32 q14, #0\n" - ASM_PREFETCH("[%[b_ptr], #176]") - "vmov.i32 q15, #0\n" - - "cmp %[k], #0\n" - "beq 6f\n" - - "1:\n" - // Unroll 0 - "vmla.f32 q4, q2, d0[0]\n" - "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n" - "vmla.f32 q5, q2, d0[1]\n" - "vmla.f32 q6, q2, d1[0]\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n" - "vmla.f32 q7, q2, d1[1]\n" - "vmla.f32 q8, q2, d2[0]\n" - "vmla.f32 q9, q2, d2[1]\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n" - - "vmla.f32 q10, q3, d0[0]\n" - "vmla.f32 q11, q3, d0[1]\n" - "vmla.f32 q12, q3, d1[0]\n" - "vmla.f32 q13, q3, d1[1]\n" - "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n" - "vmla.f32 q14, q3, d2[0]\n" - "vmla.f32 q15, q3, d2[1]\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n" - - // Unroll 1 - "vmla.f32 q4, q2, d3[0]\n" - "subs %[k], %[k], #1\n" - "vmla.f32 q5, q2, d3[1]\n" - ASM_PREFETCH("[%[a_ptr], #208]") - "vmla.f32 q6, q2, d0[0]\n" - "vmla.f32 q7, q2, d0[1]\n" - ASM_PREFETCH("[%[b_ptr], #192]") - "vmla.f32 q8, q2, d1[0]\n" - "vmla.f32 q9, q2, d1[1]\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n" - - "vmla.f32 q10, q3, d3[0]\n" - "vmla.f32 q11, q3, d3[1]\n" - "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n" - "vmla.f32 q12, q3, d0[0]\n" - "vmla.f32 q13, q3, d0[1]\n" - "vmla.f32 q14, q3, d1[0]\n" - "vmla.f32 q15, q3, d1[1]\n" - "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n" - - // Unroll 2 - "vmla.f32 q4, q2, d2[0]\n" - "vmla.f32 q5, q2, d2[1]\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n" - "vmla.f32 q6, q2, d3[0]\n" - "vmla.f32 q7, q2, d3[1]\n" - ASM_PREFETCH("[%[a_ptr], #240]") - "vmla.f32 q8, q2, d0[0]\n" - "vmla.f32 q9, q2, d0[1]\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n" - - "vmla.f32 q10, q3, d2[0]\n" - "vmla.f32 q11, q3, d2[1]\n" - ASM_PREFETCH("[%[b_ptr], #208]") - "vmla.f32 q12, q3, d3[0]\n" - "vmla.f32 q13, q3, d3[1]\n" - "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n" - "vmla.f32 q14, q3, d0[0]\n" - "vmla.f32 q15, q3, d0[1]\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n" - - // Unroll 3 - "vmla.f32 q4, q2, d1[0]\n" - "vmla.f32 q5, q2, d1[1]\n" - "vmla.f32 q6, q2, d2[0]\n" - "vmla.f32 q7, q2, d2[1]\n" - "vmla.f32 q8, q2, d3[0]\n" - "vmla.f32 q9, q2, d3[1]\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n" - - "vmla.f32 q10, q3, d1[0]\n" - "vmla.f32 q11, q3, d1[1]\n" - "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n" - "vmla.f32 q12, q3, d2[0]\n" - "vmla.f32 q13, q3, d2[1]\n" - "vmla.f32 q14, q3, d3[0]\n" - "vmla.f32 q15, q3, d3[1]\n" - "bne 1b\n" - - // Branch here if we never execute main loop. - "6:\n" - - // "Tails" shows how many multiply blocks are needed at the - // end, must be 1-4 inclusive. Bail out to alternative tail - // immediately if it's 1. 
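The branch structure this comment introduces (exit labels 3:, 4: and 5: below) peels off after exactly `tails` multiply blocks. A compilable control-flow model of just that dispatch, with counting in place of the arithmetic (names ours):

    #include <cassert>

    // Mirrors the subs/beq chain: decrement-and-test before each unroll,
    // with a one-block tail at each exit label. Returns blocks executed.
    static int tail_blocks_run(int tails) {
        assert(tails >= 1 && tails <= 4);
        int n = 0;
        if (--tails == 0) { return n + 1; } // "beq 3f": tails==1 tail block
        n++;                                // Unroll 0
        if (--tails == 0) { return n + 1; } // "beq 4f": tails==2 tail block
        n++;                                // Unroll 1
        if (--tails == 0) { return n + 1; } // "beq 5f": tails==3 tail block
        return n + 2;                       // Unrolls 2 and 3, stores fused in
    }

For every tails in 1..4 the function returns tails, which is the invariant the assembly relies on.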
- "subs %[tails], %[tails], #1\n" - "beq 3f\n" - - // Detached final iteration - // Unroll 0 - "vmla.f32 q4, q2, d0[0]\n" - "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n" - "vmla.f32 q5, q2, d0[1]\n" - "vmla.f32 q6, q2, d1[0]\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n" - "vmla.f32 q7, q2, d1[1]\n" - "vmla.f32 q8, q2, d2[0]\n" - "subs %[tails], %[tails], #1\n" - "vmla.f32 q9, q2, d2[1]\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n" - - "vmla.f32 q10, q3, d0[0]\n" - "vmla.f32 q11, q3, d0[1]\n" - "vmla.f32 q12, q3, d1[0]\n" - "vmla.f32 q13, q3, d1[1]\n" - "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n" - "vmla.f32 q14, q3, d2[0]\n" - "vmla.f32 q15, q3, d2[1]\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n" - "beq 4f\n" - - // Unroll 1 - "vmla.f32 q4, q2, d3[0]\n" - "vmla.f32 q5, q2, d3[1]\n" - "subs %[tails], %[tails], #1\n" - "vmla.f32 q6, q2, d0[0]\n" - "vmla.f32 q7, q2, d0[1]\n" - "vmla.f32 q8, q2, d1[0]\n" - "vmla.f32 q9, q2, d1[1]\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n" - - "vmla.f32 q10, q3, d3[0]\n" - "vmla.f32 q11, q3, d3[1]\n" - "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n" - "vmla.f32 q12, q3, d0[0]\n" - "vmla.f32 q13, q3, d0[1]\n" - "vmla.f32 q14, q3, d1[0]\n" - "vmla.f32 q15, q3, d1[1]\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n" - "beq 5f\n" - - // Unroll 2 - "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n" - "vmla.f32 q4, q2, d2[0]\n" - "vmla.f32 q5, q2, d2[1]\n" - "vmla.f32 q6, q2, d3[0]\n" - "vmla.f32 q7, q2, d3[1]\n" - "vmla.f32 q8, q2, d0[0]\n" - "vmla.f32 q9, q2, d0[1]\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n" - - "vmla.f32 q10, q3, d2[0]\n" - "vmla.f32 q11, q3, d2[1]\n" - "vmla.f32 q12, q3, d3[0]\n" - "vmla.f32 q13, q3, d3[1]\n" - "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n" - "vmla.f32 q14, q3, d0[0]\n" - "vmla.f32 q15, q3, d0[1]\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n" - - // Unroll 3 - "vmla.f32 q4, q2, d1[0]\n" - "vmla.f32 q10, q3, d1[0]\n" - "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n" - "vmla.f32 q5, q2, d1[1]\n" - "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n" - "vmla.f32 q11, q3, d1[1]\n" - "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n" - "vmla.f32 q6, q2, d2[0]\n" - "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n" - "vmla.f32 q12, q3, d2[0]\n" - "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n" - "vmla.f32 q7, q2, d2[1]\n" - "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n" - "vmla.f32 q13, q3, d2[1]\n" - "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n" - "vmla.f32 q8, q2, d3[0]\n" - "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n" - "vmla.f32 q14, q3, d3[0]\n" - "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n" - "vmla.f32 q9, q2, d3[1]\n" - "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n" - "vmla.f32 q15, q3, d3[1]\n" - "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n" - "b 2f\n" - - // tails==1 final tail - "3:\n" - "vmla.f32 q4, q2, d0[0]\n" - "vld1.32 {d2}, [%[a_ptr] :64]!\n" - "vmla.f32 q5, q2, d0[1]\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n" - "vmla.f32 q6, q2, d1[0]\n" - "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n" - "vmla.f32 q10, q3, d0[0]\n" - "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n" - "vmla.f32 q11, q3, d0[1]\n" - "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n" - "vmla.f32 q12, q3, d1[0]\n" - "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n" - "vmla.f32 q7, q2, d1[1]\n" - "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n" - "vmla.f32 q13, q3, d1[1]\n" - "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n" - "vmla.f32 q8, q2, d2[0]\n" - "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n" - "vmla.f32 q14, q3, d2[0]\n" - "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n" - "vmla.f32 q9, q2, d2[1]\n" - "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n" - "vmla.f32 q15, q3, d2[1]\n" - "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n" - "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n" - "b 2f\n" - - // 
tails==2 final tail - "4:\n" - "vmla.f32 q4, q2, d3[0]\n" - "vmla.f32 q10, q3, d3[0]\n" - "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n" - "vmla.f32 q5, q2, d3[1]\n" - "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n" - "vmla.f32 q11, q3, d3[1]\n" - "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n" - "vmla.f32 q6, q2, d0[0]\n" - "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n" - "vmla.f32 q12, q3, d0[0]\n" - "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n" - "vmla.f32 q7, q2, d0[1]\n" - "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n" - "vmla.f32 q13, q3, d0[1]\n" - "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n" - "vmla.f32 q8, q2, d1[0]\n" - "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n" - "vmla.f32 q14, q3, d1[0]\n" - "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n" - "vmla.f32 q9, q2, d1[1]\n" - "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n" - "vmla.f32 q15, q3, d1[1]\n" - "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n" - "b 2f\n" - - // tails==3 final tail - "5:\n" - "vmla.f32 q4, q2, d2[0]\n" - "vld1.32 {d0}, [%[a_ptr] :64]!\n" - "vmla.f32 q5, q2, d2[1]\n" - "vmla.f32 q6, q2, d3[0]\n" - "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n" - "vmla.f32 q10, q3, d2[0]\n" - "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n" - "vmla.f32 q11, q3, d2[1]\n" - "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n" - "vmla.f32 q12, q3, d3[0]\n" - "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n" - "vmla.f32 q7, q2, d3[1]\n" - "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n" - "vmla.f32 q13, q3, d3[1]\n" - "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n" - "vmla.f32 q8, q2, d0[0]\n" - "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n" - "vmla.f32 q14, q3, d0[0]\n" - "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n" - "vmla.f32 q9, q2, d0[1]\n" - "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n" - "vmla.f32 q15, q3, d0[1]\n" - "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n" - "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n" - - "2:\n" - "vst1.32 {d30-d31}, [%[c_ptr] :128]!\n" - : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), [k] "+r" (k), [tails] "+r" (tails) - : - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "cc" - ); - } - } -} diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8.hpp deleted file mode 100644 index f7659b9a67..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8.hpp +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */
-#pragma once
-
-#ifdef __aarch64__
-
-// Actual kernel implementations
-#include "a64_gemm_s16_12x8/generic.hpp"
-
-// 12x8 GEMM "strategy" class.
-//
-// This describes the characteristics of a family of kernels, in terms of
-// the required interleave properties and the output block size.
-//
-// All kernels in the family must share these characteristics. The actual
-// kernel to be used can be chosen at runtime, based on the CPUInfo
-// structure.
-class gemm_s16_12x8 {
-public:
-    typedef int16_t operand_type;
-    typedef int32_t result_type;
-
-    typedef void (*kern_type)(const int16_t *, const int16_t *, int32_t *, int, int, int);
-
-    /* Describes the data layout for A input */
-    static const int A_interleave = 8;
-    static const int A_block = 1;
-    static const int A_transpose = 0;
-
-    /* Same for B input */
-    static const int B_interleave = 12;
-    static const int B_block = 1;
-    static const int B_transpose = 1;
-
-    /* Kernel blocking parameters */
-    static const int out_width = 12;
-    static const int out_height = 8;
-    static const int k_unroll = 1;
-
-    kern_type kernel = nullptr;
-
-    gemm_s16_12x8(const CPUInfo *ci) {
-        kernel = a64_gemm_s16_asimd_12x8;
-    }
-};
-
-#endif // __aarch64__
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8/generic.hpp
deleted file mode 100644
index 10259b2fdf..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8/generic.hpp
+++ /dev/null
@@ -1,313 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */ -#pragma once -#include <arm_neon.h> - -inline void a64_gemm_s16_asimd_12x8(const int16_t *Apanel, const int16_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) -{ - const int16_t *a_ptr = Apanel; - int32_t *c_ptr = Cpanel; - for (int yb = 0; yb < ablocks; yb++) - { - const int16_t *a_ptr0 = a_ptr; - const int16_t *b_ptr = Bpanel; - - for (int xb = 0; xb < bblocks; xb++) - { - a_ptr = a_ptr0; - const bool odd_k = K & 0x1; - int k = (K+1)/2 - 1; - - register int16x8_t aa asm("v0"); - register int16x8_t ab asm("v1"); - register int16x8_t b0 asm("v2"); - register int16x8_t b1 asm("v3"); - register int16x8_t b2 asm("v4"); - - __asm __volatile ( - "ldr %d[aa], [%x[a_ptr]]\n" // Load A[A].lower - "movi v5.4s, #0\n" - "ldr x20, [%x[a_ptr], #0x08]\n" // Load A[A].upper - "movi v6.4s, #0\n" - "ldr %d[b0], [%x[b_ptr]]\n" // Load B[0].lower - "ins %[aa].d[1], x20\n" // Merge A[A].lower and upper - "movi v7.4s, #0\n" - ASM_PREFETCH("[%[a_ptr], #64]") - "movi v8.4s, #0\n" - "ldr x20, [%x[b_ptr], #0x08]\n" // Load B[0].upper - "movi v9.4s, #0\n" - ASM_PREFETCH("[%[b_ptr], #64]") - "movi v10.4s, #0\n" - "ldr %d[b1], [%x[b_ptr], #0x10]\n" // Load B[1].lower - "ins %[b0].d[1], x20\n" // Merge B[0].lower and upper - "movi v11.4s, #0\n" - ASM_PREFETCH("[%[a_ptr], #96]") - "movi v12.4s, #0\n" - "movi v13.4s, #0\n" - ASM_PREFETCH("[%[b_ptr], #96]") - "movi v14.4s, #0\n" - "movi v15.4s, #0\n" - ASM_PREFETCH("[%[a_ptr], #128]") - "movi v16.4s, #0\n" - "movi v17.4s, #0\n" - ASM_PREFETCH("[%[b_ptr], #128]") - "movi v18.4s, #0\n" - "movi v19.4s, #0\n" - ASM_PREFETCH("[%[a_ptr], #160]") - "movi v20.4s, #0\n" - "movi v21.4s, #0\n" - ASM_PREFETCH("[%[b_ptr], #160]") - "movi v22.4s, #0\n" - "movi v23.4s, #0\n" - ASM_PREFETCH("[%[a_ptr], #192]") - "movi v24.4s, #0\n" - "add %x[a_ptr], %x[a_ptr], #0x10\n" - "movi v25.4s, #0\n" - ASM_PREFETCH("[%[b_ptr], #192]") - "movi v26.4s, #0\n" - "add %x[b_ptr], %x[b_ptr], #0x18\n" - "movi v27.4s, #0\n" - "movi v28.4s, #0\n" - - "cbz %x[k], 2f\n" // Skip the loop if doing zero iterations. 
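The main loop that follows is built from SMLAL/SMLAL2 pairs, the widening multiply-accumulate that takes int16 operands and accumulates into int32 lanes, which is why operand_type is int16_t while result_type is int32_t. A minimal intrinsics sketch of one such pair (function name ours, lane fixed at 0 where the kernel uses lanes 0-7):

    #include <arm_neon.h>

    // One SMLAL/SMLAL2 pair: multiply eight int16 B values by a single
    // broadcast int16 A element and widen-accumulate into 2x int32x4.
    static inline void widening_mac_lane0(int32x4_t &acc_lo, int32x4_t &acc_hi,
                                          int16x8_t b, int16x8_t a) {
        acc_lo = vmlal_laneq_s16(acc_lo, vget_low_s16(b), a, 0);  // smlal  vX.4s, b.4h, a.h[0]
        acc_hi = vmlal_high_laneq_s16(acc_hi, b, a, 0);           // smlal2 vX.4s, b.8h, a.h[0]
    }

Each K-step updates all 96 int32 accumulators (8 rows by 12 columns) held in v5-v28, three SMLAL-class instructions per row, so the b0/b1/b2 registers together carry two K-steps' worth of B data.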
- - "1:\n" // Main loop - // First unroll - "smlal v5.4s, %[b0].4h, %[aa].h[0]\n" - "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper - "smlal v6.4s, %[b0].4h, %[aa].h[1]\n" - "smlal v7.4s, %[b0].4h, %[aa].h[2]\n" - "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower - "ins %[b1].d[1], x20\n" // Merge B[1].lower and .upper - "smlal v8.4s, %[b0].4h, %[aa].h[3]\n" - "smlal v9.4s, %[b0].4h, %[aa].h[4]\n" - "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper - "smlal v10.4s, %[b0].4h, %[aa].h[5]\n" - "smlal v11.4s, %[b0].4h, %[aa].h[6]\n" - "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower - "ins %[ab].d[1], x20\n" // Merge A[B].lower and .upper - "smlal v12.4s, %[b0].4h, %[aa].h[7]\n" - "smlal2 v13.4s, %[b0].8h, %[aa].h[0]\n" - "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper - "smlal2 v14.4s, %[b0].8h, %[aa].h[1]\n" - "smlal2 v15.4s, %[b0].8h, %[aa].h[2]\n" - "smlal2 v16.4s, %[b0].8h, %[aa].h[3]\n" - "smlal2 v17.4s, %[b0].8h, %[aa].h[4]\n" - "smlal2 v18.4s, %[b0].8h, %[aa].h[5]\n" - "smlal2 v19.4s, %[b0].8h, %[aa].h[6]\n" - "smlal2 v20.4s, %[b0].8h, %[aa].h[7]\n" - "ldr %d[b0], [%x[b_ptr], #0x18]\n" // Load B[0].lower - "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper - "smlal v21.4s, %[b1].4h, %[aa].h[0]\n" - "smlal v22.4s, %[b1].4h, %[aa].h[1]\n" - "ldr x20, [%x[b_ptr], #0x20]\n" // Load B[0].upper - "smlal v23.4s, %[b1].4h, %[aa].h[2]\n" - "smlal v24.4s, %[b1].4h, %[aa].h[3]\n" - "smlal v25.4s, %[b1].4h, %[aa].h[4]\n" - "smlal v26.4s, %[b1].4h, %[aa].h[5]\n" - "smlal v27.4s, %[b1].4h, %[aa].h[6]\n" - "smlal v28.4s, %[b1].4h, %[aa].h[7]\n" - - // Second unroll - "smlal2 v5.4s, %[b1].8h, %[ab].h[0]\n" - "ldr %d[aa], [%x[a_ptr], #0x10]\n" // Load A[A].lower - "ins %[b0].d[1], x20\n" // Merge B[0].lower and .upper - "smlal2 v6.4s, %[b1].8h, %[ab].h[1]\n" - "smlal2 v7.4s, %[b1].8h, %[ab].h[2]\n" - "ldr x20, [%x[a_ptr], #0x18]\n" // Load A[A].upper - "smlal2 v8.4s, %[b1].8h, %[ab].h[3]\n" - "smlal2 v9.4s, %[b1].8h, %[ab].h[4]\n" - "smlal2 v10.4s, %[b1].8h, %[ab].h[5]\n" - "smlal2 v11.4s, %[b1].8h, %[ab].h[6]\n" - "add %x[a_ptr], %x[a_ptr], #0x20\n" - "smlal2 v12.4s, %[b1].8h, %[ab].h[7]\n" - "smlal v13.4s, %[b2].4h, %[ab].h[0]\n" - ASM_PREFETCH("[%[b_ptr], #320]") - "smlal v14.4s, %[b2].4h, %[ab].h[1]\n" - "smlal v15.4s, %[b2].4h, %[ab].h[2]\n" - ASM_PREFETCH("[%[a_ptr], #320]") - "smlal v16.4s, %[b2].4h, %[ab].h[3]\n" - "smlal v17.4s, %[b2].4h, %[ab].h[4]\n" - ASM_PREFETCH("[%[b_ptr], #448]") - "smlal v18.4s, %[b2].4h, %[ab].h[5]\n" - "smlal v19.4s, %[b2].4h, %[ab].h[6]\n" - "smlal v20.4s, %[b2].4h, %[ab].h[7]\n" - "smlal2 v21.4s, %[b2].8h, %[ab].h[0]\n" - "smlal2 v22.4s, %[b2].8h, %[ab].h[1]\n" - "subs %x[k], %x[k], #0x1\n" - "smlal2 v23.4s, %[b2].8h, %[ab].h[2]\n" - "smlal2 v24.4s, %[b2].8h, %[ab].h[3]\n" - "ldr %d[b1], [%x[b_ptr], #0x28]\n" // Load B[1].lower - "ins %[aa].d[1], x20\n" // Merge A[A].lower and .upper - "smlal2 v25.4s, %[b2].8h, %[ab].h[4]\n" - "smlal2 v26.4s, %[b2].8h, %[ab].h[5]\n" - "add %x[b_ptr], %x[b_ptr], #0x30\n" - "smlal2 v27.4s, %[b2].8h, %[ab].h[6]\n" - "smlal2 v28.4s, %[b2].8h, %[ab].h[7]\n" - "bne 1b\n" - - "2:\n" // Even tail - "cbnz %x[odd_k], 3f\n" - - "smlal v5.4s, %[b0].4h, %[aa].h[0]\n" - "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper - "smlal v6.4s, %[b0].4h, %[aa].h[1]\n" - "smlal v7.4s, %[b0].4h, %[aa].h[2]\n" - "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower - "ins %[b1].d[1], x20\n" // Merge B[1].lower and .upper - "smlal v8.4s, %[b0].4h, %[aa].h[3]\n" - "smlal v9.4s, %[b0].4h, %[aa].h[4]\n" - "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper - "smlal v10.4s, 
%[b0].4h, %[aa].h[5]\n" - "smlal v11.4s, %[b0].4h, %[aa].h[6]\n" - "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower - "ins %[ab].d[1], x20\n" // Merge A[B].lower and .upper - "smlal v12.4s, %[b0].4h, %[aa].h[7]\n" - "smlal2 v13.4s, %[b0].8h, %[aa].h[0]\n" - "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper - "smlal2 v14.4s, %[b0].8h, %[aa].h[1]\n" - "smlal2 v15.4s, %[b0].8h, %[aa].h[2]\n" - "smlal2 v16.4s, %[b0].8h, %[aa].h[3]\n" - "add %[a_ptr], %[a_ptr], #0x10\n" - "smlal2 v17.4s, %[b0].8h, %[aa].h[4]\n" - "add %[b_ptr], %[b_ptr], #0x18\n" - "smlal2 v18.4s, %[b0].8h, %[aa].h[5]\n" - "smlal2 v19.4s, %[b0].8h, %[aa].h[6]\n" - "smlal2 v20.4s, %[b0].8h, %[aa].h[7]\n" - "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper - "smlal v21.4s, %[b1].4h, %[aa].h[0]\n" - "smlal v22.4s, %[b1].4h, %[aa].h[1]\n" - "smlal v23.4s, %[b1].4h, %[aa].h[2]\n" - "smlal v24.4s, %[b1].4h, %[aa].h[3]\n" - "smlal v25.4s, %[b1].4h, %[aa].h[4]\n" - "smlal v26.4s, %[b1].4h, %[aa].h[5]\n" - "smlal v27.4s, %[b1].4h, %[aa].h[6]\n" - "smlal v28.4s, %[b1].4h, %[aa].h[7]\n" - - "smlal2 v5.4s, %[b1].8h, %[ab].h[0]\n" - "smlal v13.4s, %[b2].4h, %[ab].h[0]\n" - "smlal2 v21.4s, %[b2].8h, %[ab].h[0]\n" - "smlal2 v6.4s, %[b1].8h, %[ab].h[1]\n" - "smlal v14.4s, %[b2].4h, %[ab].h[1]\n" - "str q5, [%x[c_ptr]]\n" - "smlal2 v22.4s, %[b2].8h, %[ab].h[1]\n" - "str q13, [%x[c_ptr], #0x10]\n" - "smlal2 v7.4s, %[b1].8h, %[ab].h[2]\n" - "str q21, [%x[c_ptr], #0x20]\n" - "smlal v15.4s, %[b2].4h, %[ab].h[2]\n" - "str q6, [%x[c_ptr], #0x30]\n" - "smlal2 v23.4s, %[b2].8h, %[ab].h[2]\n" - "str q14, [%x[c_ptr], #0x40]\n" - "smlal2 v8.4s, %[b1].8h, %[ab].h[3]\n" - "str q22, [%x[c_ptr], #0x50]\n" - "smlal v16.4s, %[b2].4h, %[ab].h[3]\n" - "str q7, [%x[c_ptr], #0x60]\n" - "smlal2 v24.4s, %[b2].8h, %[ab].h[3]\n" - "str q15, [%x[c_ptr], #0x70]\n" - "smlal2 v9.4s, %[b1].8h, %[ab].h[4]\n" - "str q23, [%x[c_ptr], #0x80]\n" - "smlal v17.4s, %[b2].4h, %[ab].h[4]\n" - "str q8, [%x[c_ptr], #0x90]\n" - "smlal2 v25.4s, %[b2].8h, %[ab].h[4]\n" - "str q16, [%x[c_ptr], #0xa0]\n" - "smlal2 v10.4s, %[b1].8h, %[ab].h[5]\n" - "str q24, [%x[c_ptr], #0xb0]\n" - "smlal v18.4s, %[b2].4h, %[ab].h[5]\n" - "str q9, [%x[c_ptr], #0xc0]\n" - "smlal2 v26.4s, %[b2].8h, %[ab].h[5]\n" - "str q17, [%x[c_ptr], #0xd0]\n" - "smlal2 v11.4s, %[b1].8h, %[ab].h[6]\n" - "str q25, [%x[c_ptr], #0xe0]\n" - "smlal v19.4s, %[b2].4h, %[ab].h[6]\n" - "str q10, [%x[c_ptr], #0xf0]\n" - "smlal2 v27.4s, %[b2].8h, %[ab].h[6]\n" - "str q18, [%x[c_ptr], #0x100]\n" - "smlal2 v12.4s, %[b1].8h, %[ab].h[7]\n" - "str q26, [%x[c_ptr], #0x110]\n" - "smlal v20.4s, %[b2].4h, %[ab].h[7]\n" - "str q11, [%x[c_ptr], #0x120]\n" - "smlal2 v28.4s, %[b2].8h, %[ab].h[7]\n" - "str q19, [%x[c_ptr], #0x130]\n" - "b 4f\n" // Complete write out - - "3:\n" // Odd tail - "smlal v5.4s, %[b0].4h, %[aa].h[0]\n" - "smlal2 v13.4s, %[b0].8h, %[aa].h[0]\n" - "smlal v21.4s, %[b1].4h, %[aa].h[0]\n" - "smlal v6.4s, %[b0].4h, %[aa].h[1]\n" - "smlal2 v14.4s, %[b0].8h, %[aa].h[1]\n" - "smlal v22.4s, %[b1].4h, %[aa].h[1]\n" - "str q5, [%x[c_ptr]]\n" - "smlal v7.4s, %[b0].4h, %[aa].h[2]\n" - "str q13, [%x[c_ptr], #0x10]\n" - "smlal2 v15.4s, %[b0].8h, %[aa].h[2]\n" - "str q21, [%x[c_ptr], #0x20]\n" - "smlal v23.4s, %[b1].4h, %[aa].h[2]\n" - "str q6, [%x[c_ptr], #0x30]\n" - "smlal v8.4s, %[b0].4h, %[aa].h[3]\n" - "str q14, [%x[c_ptr], #0x40]\n" - "smlal2 v16.4s, %[b0].8h, %[aa].h[3]\n" - "str q22, [%x[c_ptr], #0x50]\n" - "smlal v24.4s, %[b1].4h, %[aa].h[3]\n" - "str q7, [%x[c_ptr], #0x60]\n" - "smlal v9.4s, %[b0].4h, %[aa].h[4]\n" - 
"str q15, [%x[c_ptr], #0x70]\n" - "smlal2 v17.4s, %[b0].8h, %[aa].h[4]\n" - "str q23, [%x[c_ptr], #0x80]\n" - "smlal v25.4s, %[b1].4h, %[aa].h[4]\n" - "str q8, [%x[c_ptr], #0x90]\n" - "smlal v10.4s, %[b0].4h, %[aa].h[5]\n" - "str q16, [%x[c_ptr], #0xa0]\n" - "smlal2 v18.4s, %[b0].8h, %[aa].h[5]\n" - "str q24, [%x[c_ptr], #0xb0]\n" - "smlal v26.4s, %[b1].4h, %[aa].h[5]\n" - "str q9, [%x[c_ptr], #0xc0]\n" - "smlal v11.4s, %[b0].4h, %[aa].h[6]\n" - "str q17, [%x[c_ptr], #0xd0]\n" - "smlal2 v19.4s, %[b0].8h, %[aa].h[6]\n" - "str q25, [%x[c_ptr], #0xe0]\n" - "smlal v27.4s, %[b1].4h, %[aa].h[6]\n" - "str q10, [%x[c_ptr], #0xf0]\n" - "smlal v12.4s, %[b0].4h, %[aa].h[7]\n" - "str q18, [%x[c_ptr], #0x100]\n" - "smlal2 v20.4s, %[b0].8h, %[aa].h[7]\n" - "str q26, [%x[c_ptr], #0x110]\n" - "smlal v28.4s, %[b1].4h, %[aa].h[7]\n" - "str q11, [%x[c_ptr], #0x120]\n" - - "4:\n" // End of function - "str q19, [%x[c_ptr], #0x130]\n" - "str q27, [%x[c_ptr], #0x140]\n" - "str q12, [%x[c_ptr], #0x150]\n" - "str q20, [%x[c_ptr], #0x160]\n" - "str q28, [%x[c_ptr], #0x170]\n" - "add %x[c_ptr], %x[c_ptr], #0x180\n" - : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), [k] "+r" (k), - [aa] "+w" (aa), [ab] "+w" (ab), [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2) - : [odd_k] "r" (odd_k) - : "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "cc" - ); - } - } -} diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8.hpp deleted file mode 100644 index 88cbb361b3..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8.hpp +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#pragma once - -#ifdef __aarch64__ - -// Load the actual kernel -#include "a64_gemm_s8_12x8/generic.hpp" - -class gemm_s8_12x8 { -public: - typedef int8_t operand_type; - typedef int32_t result_type; - - typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int); - - /* Describes the data layout for A input */ - static const int A_interleave = 8; - static const int A_block = 4; - static const bool A_transpose = false; - - /* Same for B input */ - static const int B_interleave = 12; - static const int B_block = 4; - static const bool B_transpose = true; - - /* Kernel blocking parameters */ - static const int out_width = 12; - static const int out_height = 8; - static const int k_unroll = 4; - - kern_type kernel = nullptr; - - gemm_s8_12x8(const CPUInfo *ci) { - kernel = a64_gemm_s8_12x8; - } -}; - -#endif // __aarch64__ - diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/a55r1.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/a55r1.hpp deleted file mode 100644 index 4ac2ba4234..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/a55r1.hpp +++ /dev/null @@ -1,398 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __aarch64__ - -#include <arm_neon.h> -#include "dot_toolchain_support.h" -#include <cassert> - -void a64_gemm_s8_12x8_a55r1(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) { - assert(Apanel); - assert(Bpanel); - assert(Cpanel); - const int8_t *a_ptr = Apanel; - int32_t *c_ptr = Cpanel; - // We divide K by 4 because the sdot instruction processes 4 elements at a time. - const int W = K/4; - // Fix up for odd lengths - set a flag if K is odd, but make. - // sure we round up the iteration count. - const int oddk = (W & 1); - const int init_value_k = ((W+1)/2) - 1; - for (int yb=0; yb<ablocks; yb++) { - const int8_t *a_ptr0 = a_ptr; - const int8_t *b_ptr = Bpanel; - for (int xb=0; xb<bblocks; xb++) { - a_ptr = a_ptr0; - int k = init_value_k; - register int32x4_t a0 asm("v0"); - register int32x4_t a1 asm("v1"); - register int32x4_t b0 asm("v2"); - register int32x4_t b1 asm("v3"); - register int32x4_t b2 asm("v4"); - register int32x4_t a0a asm("v5"); - register int32x4_t a1a asm("v6"); - __asm __volatile ( - _DECLARE_SDOT - // Initialize result registers, load initial operands, prime prefetches. 
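Every multiply in this kernel is an indexed dot product: "sdot vd.4s, vn.16b, vm.4b[lane]" accumulates four int8 products into each int32 lane, which is why K is divided by 4 above and k_unroll is 4 in the strategy class. A scalar model of the instruction (function name ours):

    #include <cstdint>

    // Scalar model of "sdot vd.4s, vn.16b, vm.4b[lane]": each of the four
    // 32-bit lanes of vd gains a 4-way dot product of int8 values.
    static void sdot_ref(int32_t vd[4], const int8_t vn[16], const int8_t vm[16], int lane) {
        for (int i = 0; i < 4; i++) {
            for (int j = 0; j < 4; j++) {
                vd[i] += int32_t(vn[4 * i + j]) * int32_t(vm[4 * lane + j]);
            }
        }
    }

With A_block and B_block both set to 4, the interleaved panels already group operands in fours, so each sdot consumes one packed group per lane.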
- "movi v8.4s, #0x0\n" - "ldp %q[a0], %q[a1], [%[a_ptr]]\n" - "movi v9.4s, #0x0\n" - "ldp %q[b0], %q[b1], [%[b_ptr]]\n" - "movi v10.4s, #0x0\n" - "movi v11.4s, #0x0\n" - "movi v12.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #64]") - "movi v13.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #64]") - "movi v14.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #128]") - "movi v15.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #128]") - "movi v16.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #192]") - "movi v17.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #256]") - "movi v18.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #192]") - "movi v19.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #320]") - "movi v20.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #256]") - "movi v21.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #384]") - "movi v22.4s, #0x0\n" - "movi v23.4s, #0x0\n" - "movi v24.4s, #0x0\n" - "movi v25.4s, #0x0\n" - "movi v26.4s, #0x0\n" - "movi v27.4s, #0x0\n" - "movi v28.4s, #0x0\n" - "movi v29.4s, #0x0\n" - "movi v30.4s, #0x0\n" - "movi v31.4s, #0x0\n" - - // Skip loop if we are doing zero iterations of it. - "cbz %w[k], 4f\n" - - - // Loop proper - "1:\n" - "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n" - "ldr %d[b2], [%[b_ptr], #32]\n" - - "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n" - "ldr x20, [%[b_ptr], #40]\n" - "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n" - "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n" - "ldr %d[a0a], [%[a_ptr], #32]\n" - - "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n" - "ins %[b2].d[1], x20\n" - "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n" - "ldr x20, [%[a_ptr], #40]\n" - "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n" - "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n" - "ldr %d[a1a], [%[a_ptr], #48]\n" - - - "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n" - "ins %[a0a].d[1], x20\n" - ASM_PREFETCH("[%[a_ptr], #320]") - "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n" - "ldr x20, [%[a_ptr], #56]\n" - "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n" - "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n" - "ldr %d[b0], [%[b_ptr], #48]\n" - - "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n" - "ins %[a1a].d[1], x20\n" - ASM_PREFETCH("[%[b_ptr], #448]") - "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n" - "ldr x20, [%[b_ptr], #56]\n" - "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n" - "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n" - "ldr %d[b1], [%[b_ptr], #64]\n" - - "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n" - "ins %[b0].d[1], x20\n" - "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n" - "ldr x20, [%[b_ptr], #72]\n" - "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n" - "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n" - - "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n" - "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n" - "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n" - "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n" - - "ldr %d[b2], [%[b_ptr], #80]\n" - - "sdot v8.4s , %[b0].16b, %[a0a].4b[0]\n" - "ins %[b1].d[1], x20\n" - "sdot v9.4s , %[b0].16b, %[a0a].4b[1]\n" - "ldr x20, [%[b_ptr], #88]\n" - "sdot v10.4s, %[b0].16b, %[a0a].4b[2]\n" - "sdot v11.4s, %[b0].16b, %[a0a].4b[3]\n" - "ldr %d[a0], [%[a_ptr], #64]\n" - - "sdot v12.4s, %[b0].16b, %[a1a].4b[0]\n" - "ins %[b2].d[1], x20\n" - "sdot v13.4s, %[b0].16b, %[a1a].4b[1]\n" - "ldr x20, [%[a_ptr], #72]\n" - "sdot v14.4s, %[b0].16b, %[a1a].4b[2]\n" - "sdot v15.4s, %[b0].16b, %[a1a].4b[3]\n" - "ldr %d[a1], [%[a_ptr], #80]\n" - - "sdot v16.4s, %[b1].16b, %[a0a].4b[0]\n" - "ins %[a0].d[1], x20\n" - ASM_PREFETCH("[%[b_ptr], #512]") - "sdot v17.4s, %[b1].16b, %[a0a].4b[1]\n" - "ldr x20, [%[a_ptr], #88]\n" - "sdot v18.4s, %[b1].16b, %[a0a].4b[2]\n" - "sdot v19.4s, %[b1].16b, %[a0a].4b[3]\n" - "ldr %d[b0], [%[b_ptr], #96]\n" - - "sdot v20.4s, %[b1].16b, %[a1a].4b[0]\n" - "ins 
%[a1].d[1], x20\n" - "sdot v21.4s, %[b1].16b, %[a1a].4b[1]\n" - "ldr x20, [%[b_ptr], #104]\n" - "sdot v22.4s, %[b1].16b, %[a1a].4b[2]\n" - "sdot v23.4s, %[b1].16b, %[a1a].4b[3]\n" - "ldr %d[b1], [%[b_ptr], #112]\n" - - "sdot v24.4s, %[b2].16b, %[a0a].4b[0]\n" - "ins %[b0].d[1], x20\n" - "sdot v25.4s, %[b2].16b, %[a0a].4b[1]\n" - "ldr x20, [%[b_ptr], #120]\n" - "add %[a_ptr], %[a_ptr], #64\n" - "sdot v26.4s, %[b2].16b, %[a0a].4b[2]\n" - "sdot v27.4s, %[b2].16b, %[a0a].4b[3]\n" - - "sdot v28.4s, %[b2].16b, %[a1a].4b[0]\n" - "sdot v29.4s, %[b2].16b, %[a1a].4b[1]\n" - "add %[b_ptr], %[b_ptr], #96\n" - "sdot v30.4s, %[b2].16b, %[a1a].4b[2]\n" - "subs %w[k], %w[k], #1\n" - "ins %[b1].d[1], x20\n" - "sdot v31.4s, %[b2].16b, %[a1a].4b[3]\n" - "ldr %d[b2], [%[b_ptr], #32]\n" - "bne 1b\n" - - // Target to use when K is 1 or 2 (i.e. zero iterations of main loop) - "4:\n" - - // Branch to alternative tail for odd K - "cbnz %w[oddk], 2f\n" - - // Detached final iteration (even K) - "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n" - "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n" - "ldr x20, [%[b_ptr], #40]\n" - "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n" - "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n" - "ldr %d[a0a], [%[a_ptr], #32]\n" - - "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n" - "ldr %d[b2], [%[b_ptr], #32]\n" - "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n" - "ldr x20, [%[a_ptr], #40]\n" - "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n" - "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n" - "ldr %d[a1a], [%[a_ptr], #48]\n" - - - "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n" - "ins %[a0a].d[1], x20\n" - "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n" - "ldr x20, [%[a_ptr], #56]\n" - "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n" - "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n" - "ldr %d[b0], [%[b_ptr], #48]\n" - - "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n" - "ins %[a1a].d[1], x20\n" - "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n" - "ldr x20, [%[b_ptr], #56]\n" - "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n" - "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n" - - "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n" - "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n" - "add %[a_ptr], %[a_ptr], #64\n" - "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n" - "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n" - "ldr %d[b1], [%[b_ptr], #64]\n" - - "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n" - "ins %[b0].d[1], x20\n" - "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n" - "ldr x20, [%[b_ptr], #72]\n" - "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n" - "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n" - "ldr %d[b2], [%[b_ptr], #80]\n" - - "sdot v8.4s , %[b0].16b, %[a0a].4b[0]\n" - "ins %[b1].d[1], x20\n" - "sdot v9.4s , %[b0].16b, %[a0a].4b[1]\n" - "ldr x20, [%[b_ptr], #88]\n" - "sdot v10.4s, %[b0].16b, %[a0a].4b[2]\n" - "ins %[b2].d[1], x20\n" - - "sdot v16.4s, %[b1].16b, %[a0a].4b[0]\n" - "sdot v24.4s, %[b2].16b, %[a0a].4b[0]\n" - "add %[b_ptr], %[b_ptr], #96\n" - "str q8, [%[c_ptr], #0]\n" - "str q16, [%[c_ptr], #16]\n" - "str q24, [%[c_ptr], #32]\n" - "sdot v17.4s, %[b1].16b, %[a0a].4b[1]\n" - - "sdot v25.4s, %[b2].16b, %[a0a].4b[1]\n" - "str q9, [%[c_ptr], #48]\n" - "str q17, [%[c_ptr], #64]\n" - "str q25, [%[c_ptr], #80]\n" - "sdot v18.4s, %[b1].16b, %[a0a].4b[2]\n" - "sdot v26.4s, %[b2].16b, %[a0a].4b[2]\n" - "str q10, [%[c_ptr], #96]\n" - "str q18, [%[c_ptr], #112]\n" - "str q26, [%[c_ptr], #128]\n" - "sdot v11.4s, %[b0].16b, %[a0a].4b[3]\n" - "sdot v19.4s, %[b1].16b, %[a0a].4b[3]\n" - "sdot v27.4s, %[b2].16b, %[a0a].4b[3]\n" - "str q11, [%[c_ptr], #144]\n" - "str q19, [%[c_ptr], #160]\n" - "str q27, [%[c_ptr], #176]\n" - "sdot v12.4s, %[b0].16b, %[a1a].4b[0]\n" - "sdot v20.4s, 
%[b1].16b, %[a1a].4b[0]\n" - "sdot v28.4s, %[b2].16b, %[a1a].4b[0]\n" - "str q12, [%[c_ptr], #192]\n" - "str q20, [%[c_ptr], #208]\n" - "str q28, [%[c_ptr], #224]\n" - "sdot v13.4s, %[b0].16b, %[a1a].4b[1]\n" - "sdot v21.4s, %[b1].16b, %[a1a].4b[1]\n" - "sdot v29.4s, %[b2].16b, %[a1a].4b[1]\n" - "str q13, [%[c_ptr], #240]\n" - "str q21, [%[c_ptr], #256]\n" - "str q29, [%[c_ptr], #272]\n" - "sdot v14.4s, %[b0].16b, %[a1a].4b[2]\n" - "sdot v22.4s, %[b1].16b, %[a1a].4b[2]\n" - "sdot v30.4s, %[b2].16b, %[a1a].4b[2]\n" - "str q14, [%[c_ptr], #288]\n" - "str q22, [%[c_ptr], #304]\n" - "str q30, [%[c_ptr], #320]\n" - "sdot v15.4s, %[b0].16b, %[a1a].4b[3]\n" - "sdot v23.4s, %[b1].16b, %[a1a].4b[3]\n" - "sdot v31.4s, %[b2].16b, %[a1a].4b[3]\n" - "str q15, [%[c_ptr], #336]\n" - - "b 3f\n" - - // Detached final iteration (odd K) - "2:\n" - "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n" - "ldr %d[b2], [%[b_ptr], #32]\n" - "ldr x20, [%[b_ptr], #40]\n" - - "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n" - "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n" - "str q8, [%[c_ptr], #0]\n" - "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n" - "str q16, [%[c_ptr], #16]\n" - "ins %[b2].d[1], x20\n" - "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n" - "add %[b_ptr], %[b_ptr], #48\n" - "add %[a_ptr], %[a_ptr], #32\n" - "str q24, [%[c_ptr], #32]\n" - "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n" - "str q9, [%[c_ptr], #48]\n" - - "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n" - "str q17, [%[c_ptr], #64]\n" - "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n" - "str q25, [%[c_ptr], #80]\n" - "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n" - "str q10, [%[c_ptr], #96]\n" - - "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n" - "str q18, [%[c_ptr], #112]\n" - "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n" - "str q26, [%[c_ptr], #128]\n" - "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n" - "str q11, [%[c_ptr], #144]\n" - - "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n" - "str q19, [%[c_ptr], #160]\n" - "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n" - "str q27, [%[c_ptr], #176]\n" - "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n" - "str q12, [%[c_ptr], #192]\n" - - "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n" - "str q20, [%[c_ptr], #208]\n" - "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n" - "str q28, [%[c_ptr], #224]\n" - "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n" - "str q13, [%[c_ptr], #240]\n" - - "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n" - "str q21, [%[c_ptr], #256]\n" - "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n" - "str q29, [%[c_ptr], #272]\n" - "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n" - "str q14, [%[c_ptr], #288]\n" - - "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n" - "str q22, [%[c_ptr], #304]\n" - "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n" - "str q30, [%[c_ptr], #320]\n" - "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n" - "str q15, [%[c_ptr], #336]\n" - - - // Common tail - "3:\n" - "str q23, [%[c_ptr], #352]\n" - "str q31, [%[c_ptr], #368]\n" - "add %[c_ptr], %[c_ptr], #384\n" - - - - ".purgem sdot\n" - : - [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), - [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a), - [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k) - : [oddk] "r" (oddk) - : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", - "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory" - ); - - - } - } -} - -#endif - diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/dot_toolchain_support.h b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/dot_toolchain_support.h deleted file mode 100644 index 
1d6fd1623e..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/dot_toolchain_support.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// Define a macro to assemble the UDOT instruction (in the absence of toolchain support) -#define _DECLARE_SDOT ".altmacro\n"\ - ".macro sdot opd:req, opn:req, opm:req\n"\ - "local vd, vn, vm, h, l\n"\ - ".irp reg,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31\n"\ - ".ifeqs \"\\opd\",\"v\\reg\\.4s\"\n"\ - ".set vd,\\reg\n"\ - ".endif\n"\ - ".ifeqs \"\\opn\",\"v\\reg\\.16b\"\n"\ - ".set vn,\\reg\n"\ - ".endif\n"\ - ".irp idx,0,1,2,3\n"\ - ".ifeqs \"\\opm\",\"v\\reg\\.4b[\\idx\\]\"\n"\ - ".set vm,\\reg\n"\ - ".set h,\\idx / 2\n"\ - ".set l,\\idx %% 2\n"\ - ".endif\n"\ - ".endr\n"\ - ".endr\n"\ - ".ifndef vd\n"\ - ".error \"Bad operand \\opd\"\n"\ - ".exitm\n"\ - ".endif\n"\ - ".ifndef vn\n"\ - ".error \"Bad operand \\opn\"\n"\ - ".exitm\n"\ - ".endif\n"\ - ".ifndef vm\n"\ - ".error \"Bad operand \\opm\"\n"\ - ".exitm\n"\ - ".endif\n"\ - ".ifndef h\n"\ - ".error \"Bad operand \\opm\"\n"\ - ".exitm\n"\ - ".endif\n"\ - ".ifndef l\n"\ - ".error \"Bad operand \\opm\"\n"\ - ".exitm\n"\ - ".endif\n"\ - ".int 0x4f80e000 | vd | (vn << 5) | (vm << 16) | (l << 21) | (h << 11)\n"\ - ".endm\n"\ - diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/generic.hpp deleted file mode 100644 index bfad0373b2..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/generic.hpp +++ /dev/null @@ -1,363 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __aarch64__ - -#include <arm_neon.h> -#include "dot_toolchain_support.h" -#include <cassert> - - -inline void a64_gemm_s8_12x8(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) { - assert(Apanel); - assert(Bpanel); - assert(Cpanel); - K/=4; - const long int row_jump=0; - const long int block_jump=0; - const int32_t *a_ptr = reinterpret_cast<const int32_t*>(Apanel); - int32_t *c_ptr = reinterpret_cast<int32_t*>(Cpanel); - for (int yb=0; yb<ablocks; yb++) { - const int32_t *a_ptr0 = a_ptr; - const int32_t *b_ptr = reinterpret_cast<const int32_t*>(Bpanel); - for (int xb=0; xb<bblocks; xb++) { - a_ptr = a_ptr0; - // Fix up for odd lengths - set a flag if K is odd, but make - // sure we round up the iteration count. - int oddk = (K & 1); - int k = ((K+1)/2) - 1; - register int32x4_t a0 asm("v0"); - register int32x4_t a1 asm("v1"); - register int32x4_t b0 asm("v2"); - register int32x4_t b1 asm("v3"); - register int32x4_t b2 asm("v4"); - register int32x4_t a0a asm("v5"); - register int32x4_t a1a asm("v6"); - __asm __volatile ( - // Initialize result registers, load initial operands, prime prefetches. - "movi v8.4s, #0x0\n" - "ldr %q[a0], [%[a_ptr]]\n" - "movi v9.4s, #0x0\n" - "ldr %q[b0], [%[b_ptr]]\n" - "movi v10.4s, #0x0\n" - "ldr %q[a1], [%[a_ptr], #16]\n" - "movi v11.4s, #0x0\n" - "ldr %q[b1], [%[b_ptr], #16]\n" - "movi v12.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #64]") - "movi v13.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #64]") - "movi v14.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #128]") - "movi v15.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #128]") - "movi v16.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #192]") - "movi v17.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #256]") - "movi v18.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #192]") - "movi v19.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #320]") - "movi v20.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #256]") - "movi v21.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #384]") - "movi v22.4s, #0x0\n" - "movi v23.4s, #0x0\n" - "movi v24.4s, #0x0\n" - "movi v25.4s, #0x0\n" - "movi v26.4s, #0x0\n" - "movi v27.4s, #0x0\n" - "movi v28.4s, #0x0\n" - "movi v29.4s, #0x0\n" - "movi v30.4s, #0x0\n" - "movi v31.4s, #0x0\n" - - // Skip loop if we are doing zero iterations of it. 
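ASM_PREFETCH is not part of this patch; it is defined in the library's assembly support header. To the best of our knowledge it expands, on AArch64, to a load-prefetch hint, roughly:

// Assumed definition, for orientation only (not shown in this diff):
#define ASM_PREFETCH(address) "PRFM PLDL1KEEP, " address "\n"

Under that assumption, the prologue above primes the prefetcher up to 384 bytes ahead on the B panel and 256 bytes ahead on the A panel while the accumulators are being zeroed, so the first loop iterations find their operands already in cache.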
- "cbz %w[k], 4f\n" - - _DECLARE_SDOT - - // Loop proper - "1:\n" - "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n" - "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n" - - "ldr %q[b2], [%[b_ptr], #32]\n" - "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n" - "add %[b_ptr], %[b_ptr], %[row_jump]\n" - "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n" - "ldr %q[a0a], [%[a_ptr], #32]\n" - "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n" - "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n" - "ldr %q[a1a], [%[a_ptr], #48]\n" - "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n" - "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n" - "ldr %q[b0], [%[b_ptr], #48]\n" - - "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n" - "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n" - ASM_PREFETCH("[%[a_ptr], #320]") - "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n" - "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n" - "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n" - "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n" - "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n" - "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n" - "ldr %q[b1], [%[b_ptr], #64]\n" - - "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n" - "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n" - ASM_PREFETCH("[%[b_ptr], #448]") - "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n" - "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n" - "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n" - "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n" - "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n" - "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n" - "ldr %q[b2], [%[b_ptr], #80]\n" - - "sdot v8.4s , %[b0].16b, %[a0a].4b[0]\n" - "sdot v9.4s , %[b0].16b, %[a0a].4b[1]\n" - "ldr %q[a0], [%[a_ptr], #64]\n" - "sdot v10.4s, %[b0].16b, %[a0a].4b[2]\n" - "add %[b_ptr], %[b_ptr], %[row_jump]\n" - "sdot v11.4s, %[b0].16b, %[a0a].4b[3]\n" - "sdot v12.4s, %[b0].16b, %[a1a].4b[0]\n" - "ldr %q[a1], [%[a_ptr], #80]\n" - "sdot v13.4s, %[b0].16b, %[a1a].4b[1]\n" - "sdot v14.4s, %[b0].16b, %[a1a].4b[2]\n" - "sdot v15.4s, %[b0].16b, %[a1a].4b[3]\n" - "ldr %q[b0], [%[b_ptr], #96]\n" - - "sdot v16.4s, %[b1].16b, %[a0a].4b[0]\n" - "sdot v17.4s, %[b1].16b, %[a0a].4b[1]\n" - ASM_PREFETCH("[%[b_ptr], #512]") - "sdot v18.4s, %[b1].16b, %[a0a].4b[2]\n" - "sdot v19.4s, %[b1].16b, %[a0a].4b[3]\n" - "sdot v20.4s, %[b1].16b, %[a1a].4b[0]\n" - "sdot v21.4s, %[b1].16b, %[a1a].4b[1]\n" - "sdot v22.4s, %[b1].16b, %[a1a].4b[2]\n" - "sdot v23.4s, %[b1].16b, %[a1a].4b[3]\n" - "ldr %q[b1], [%[b_ptr], #112]\n" - - "sdot v24.4s, %[b2].16b, %[a0a].4b[0]\n" - "sdot v25.4s, %[b2].16b, %[a0a].4b[1]\n" - "add %[a_ptr], %[a_ptr], #64\n" - "sdot v26.4s, %[b2].16b, %[a0a].4b[2]\n" - "sdot v27.4s, %[b2].16b, %[a0a].4b[3]\n" - "add %[b_ptr], %[b_ptr], #96\n" - "sdot v28.4s, %[b2].16b, %[a1a].4b[0]\n" - "sdot v29.4s, %[b2].16b, %[a1a].4b[1]\n" - "subs %w[k], %w[k], #1\n" - "sdot v30.4s, %[b2].16b, %[a1a].4b[2]\n" - "sdot v31.4s, %[b2].16b, %[a1a].4b[3]\n" - "bne 1b\n" - - // Target to use when K is 1 or 2 (i.e. 
zero iterations of main loop) - "4:\n" - - // Branch to alternative tail for odd K - "cbnz %w[oddk], 2f\n" - - // Detached final iteration (even K) - "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n" - "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n" - "ldr %q[b2], [%[b_ptr], #32]\n" - "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n" - "add %[b_ptr], %[b_ptr], %[row_jump]\n" - "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n" - "ldr %q[a0a], [%[a_ptr], #32]\n" - "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n" - "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n" - "ldr %q[a1a], [%[a_ptr], #48]\n" - "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n" - "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n" - "ldr %q[b0], [%[b_ptr], #48]\n" - - "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n" - "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n" - "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n" - "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n" - "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n" - "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n" - "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n" - "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n" - "ldr %q[b1], [%[b_ptr], #64]\n" - - "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n" - "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n" - "add %[a_ptr], %[a_ptr], #64\n" - "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n" - "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n" - "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n" - "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n" - "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n" - "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n" - "ldr %q[b2], [%[b_ptr], #80]\n" - - "sdot v8.4s , %[b0].16b, %[a0a].4b[0]\n" - - "add %[b_ptr], %[b_ptr], %[block_jump]\n" - "sdot v16.4s, %[b1].16b, %[a0a].4b[0]\n" - "add %[b_ptr], %[b_ptr], #96\n" - "sdot v9.4s , %[b0].16b, %[a0a].4b[1]\n" - "add %[b_ptr], %[b_ptr], %[row_jump]\n" - "str q8, [%[c_ptr], #0]\n" - "sdot v17.4s, %[b1].16b, %[a0a].4b[1]\n" - "str q16, [%[c_ptr], #16]\n" - "sdot v24.4s, %[b2].16b, %[a0a].4b[0]\n" - "str q24, [%[c_ptr], #32]\n" - - "sdot v25.4s, %[b2].16b, %[a0a].4b[1]\n" - "str q9, [%[c_ptr], #48]\n" - "sdot v10.4s, %[b0].16b, %[a0a].4b[2]\n" - "str q17, [%[c_ptr], #64]\n" - "sdot v18.4s, %[b1].16b, %[a0a].4b[2]\n" - "str q25, [%[c_ptr], #80]\n" - "sdot v26.4s, %[b2].16b, %[a0a].4b[2]\n" - "str q10, [%[c_ptr], #96]\n" - - "sdot v11.4s, %[b0].16b, %[a0a].4b[3]\n" - "str q18, [%[c_ptr], #112]\n" - "sdot v19.4s, %[b1].16b, %[a0a].4b[3]\n" - "str q26, [%[c_ptr], #128]\n" - "sdot v27.4s, %[b2].16b, %[a0a].4b[3]\n" - "str q11, [%[c_ptr], #144]\n" - - "sdot v12.4s, %[b0].16b, %[a1a].4b[0]\n" - "str q19, [%[c_ptr], #160]\n" - "sdot v20.4s, %[b1].16b, %[a1a].4b[0]\n" - "str q27, [%[c_ptr], #176]\n" - "sdot v28.4s, %[b2].16b, %[a1a].4b[0]\n" - "str q12, [%[c_ptr], #192]\n" - - "sdot v13.4s, %[b0].16b, %[a1a].4b[1]\n" - "str q20, [%[c_ptr], #208]\n" - "sdot v21.4s, %[b1].16b, %[a1a].4b[1]\n" - "str q28, [%[c_ptr], #224]\n" - "sdot v29.4s, %[b2].16b, %[a1a].4b[1]\n" - "str q13, [%[c_ptr], #240]\n" - - "sdot v14.4s, %[b0].16b, %[a1a].4b[2]\n" - "str q21, [%[c_ptr], #256]\n" - "sdot v22.4s, %[b1].16b, %[a1a].4b[2]\n" - "str q29, [%[c_ptr], #272]\n" - "sdot v30.4s, %[b2].16b, %[a1a].4b[2]\n" - "str q14, [%[c_ptr], #288]\n" - - "sdot v15.4s, %[b0].16b, %[a1a].4b[3]\n" - "str q22, [%[c_ptr], #304]\n" - "sdot v23.4s, %[b1].16b, %[a1a].4b[3]\n" - "str q30, [%[c_ptr], #320]\n" - "sdot v31.4s, %[b2].16b, %[a1a].4b[3]\n" - "str q15, [%[c_ptr], #336]\n" - - "b 3f\n" - - // Detached final iteration (odd K) - "2:\n" - "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n" - "ldr %q[b2], [%[b_ptr], #32]\n" - "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n" - "add %[b_ptr], %[b_ptr], %[row_jump]\n" - "sdot v9.4s , %[b0].16b, 
%[a0].4b[1]\n" - "str q8, [%[c_ptr], #0]\n" - "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n" - "str q16, [%[c_ptr], #16]\n" - "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n" - "add %[b_ptr], %[b_ptr], #48\n" - "add %[a_ptr], %[a_ptr], #32\n" - "str q24, [%[c_ptr], #32]\n" - "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n" - "str q9, [%[c_ptr], #48]\n" - - "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n" - "str q17, [%[c_ptr], #64]\n" - "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n" - "str q25, [%[c_ptr], #80]\n" - "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n" - "str q10, [%[c_ptr], #96]\n" - - "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n" - "str q18, [%[c_ptr], #112]\n" - "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n" - "str q26, [%[c_ptr], #128]\n" - "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n" - "str q11, [%[c_ptr], #144]\n" - - "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n" - "str q19, [%[c_ptr], #160]\n" - "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n" - "str q27, [%[c_ptr], #176]\n" - "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n" - "str q12, [%[c_ptr], #192]\n" - - "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n" - "str q20, [%[c_ptr], #208]\n" - "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n" - "str q28, [%[c_ptr], #224]\n" - "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n" - "str q13, [%[c_ptr], #240]\n" - - "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n" - "str q21, [%[c_ptr], #256]\n" - "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n" - "str q29, [%[c_ptr], #272]\n" - "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n" - "str q14, [%[c_ptr], #288]\n" - - "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n" - "str q22, [%[c_ptr], #304]\n" - "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n" - "str q30, [%[c_ptr], #320]\n" - "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n" - "str q15, [%[c_ptr], #336]\n" - - - // Common tail - "3:\n" - "str q23, [%[c_ptr], #352]\n" - "str q31, [%[c_ptr], #368]\n" - "add %[c_ptr], %[c_ptr], #384\n" - - ".purgem sdot\n" - : - [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), - [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a), - [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k) - : [oddk] "r" (oddk), [row_jump] "r" (row_jump), [block_jump] "r" (block_jump) - : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", - "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc" - ); - } - } - - -} - - -#endif diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4.hpp deleted file mode 100644 index 1588f049f4..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4.hpp +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
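One idiom worth noting before the next header: the operand list that closes each of these kernels binds C variables to fixed NEON registers (register ... asm("v0")) and passes them as "+w" read-write operands, which lets the asm template address the same register as %q[x], %[x].16b or %[x].4b[n]. A minimal sketch of the idiom, ours rather than the patch's (AArch64 only):

#include <arm_neon.h>

// Pin a vector variable to v0 and hand it to inline asm as "+w";
// the asm body can then use any form of v0 to refer to it.
int32x4_t double_lanes(int32x4_t v)
{
    register int32x4_t acc asm("v0") = v;
    __asm__ volatile(
        "add %[acc].4s, %[acc].4s, %[acc].4s\n" // acc += acc, lane-wise
        : [acc] "+w"(acc));
    return acc;
}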
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __aarch64__ - -// Load the actual kernel -#include "a64_gemm_s8_4x4/generic.hpp" - -class gemm_s8_4x4 { -public: - typedef int8_t operand_type; - typedef int32_t result_type; - - typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int); - - /* Describes the data layout for A input */ - static const int A_interleave = 4; - static const int A_block = 16; - static const bool A_transpose = false; - - /* Same for B input */ - static const int B_interleave = 4; - static const int B_block = 16; - static const bool B_transpose = true; - - /* Kernel blocking parameters */ - static const int out_width = 4; - static const int out_height = 4; - static const int k_unroll = 16; - - kern_type kernel = nullptr; - - gemm_s8_4x4(const CPUInfo *ci) { - kernel = a64_gemm_s8_4x4; - } -}; - -#endif // __aarch64__ - diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4/generic.hpp deleted file mode 100644 index 0ec435b33b..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4/generic.hpp +++ /dev/null @@ -1,465 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
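The gemm_s8_4x4 class above, like the other removed headers, is a pure trait bundle: the packed layout of the A and B panels plus the output tile size, with the kernel function chosen in the constructor. As a hypothetical sketch of how a driver might consume such a strategy (the template shape and panel-count arithmetic are our illustration, not the library's actual driver code):

struct CPUInfo; // opaque here, as in the headers above

template <typename Strategy>
void run_tiled_gemm(const typename Strategy::operand_type *a_panel,
                    const typename Strategy::operand_type *b_panel,
                    typename Strategy::result_type *c_panel,
                    int M, int N, int K, const CPUInfo *ci)
{
    Strategy strat(ci);
    // Panels are assumed pre-packed and padded to the kernel's unroll factor.
    const int k_rounded = ((K + Strategy::k_unroll - 1) / Strategy::k_unroll) * Strategy::k_unroll;
    const int ablocks   = (M + Strategy::out_height - 1) / Strategy::out_height;
    const int bblocks   = (N + Strategy::out_width  - 1) / Strategy::out_width;
    strat.kernel(a_panel, b_panel, c_panel, ablocks, bblocks, k_rounded);
}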
- */ -#pragma once - -#ifdef __aarch64__ - -#include <arm_neon.h> - -inline void a64_gemm_s8_4x4(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) { - const int8_t *a_ptr = Apanel; - int32_t *c_ptr = Cpanel; - K /= 16; - int oddk = (K & 1); - - for (int yb=0; yb<ablocks; yb++) { - const int8_t *a_ptr0 = a_ptr; - const int8_t *b_ptr = Bpanel; - - for (int xb=0; xb<bblocks; xb++) { - a_ptr = a_ptr0; - - int k = ((K+1)/2)-1; - - register int8x16_t b0 asm("v4"); - register int8x16_t b1 asm("v5"); - register int8x16_t b2 asm("v6"); - register int8x16_t b3 asm("v7"); - register int8x16_t b0a asm("v8"); - register int8x16_t b1a asm("v9"); - register int8x16_t b2a asm("v10"); - register int8x16_t b3a asm("v11"); - - __asm __volatile ( - "movi v16.4s, #0x0\n" - "ldr q0, [%[a_ptr]]\n" - "movi v17.4s, #0x0\n" - "ldr %q[b0], [%[b_ptr]]\n" - "movi v18.4s, #0x0\n" - "ldr %q[b1], [%[b_ptr], #16]\n" - "movi v19.4s, #0x0\n" - "ldr %q[b2], [%[b_ptr], #32]\n" - "movi v20.4s, #0x0\n" - "ldr %q[b3], [%[b_ptr], #48]\n" - "movi v21.4s, #0x0\n" - "ldr q1, [%[a_ptr], #16]\n" - "movi v22.4s, #0x0\n" - "ldr q2, [%[a_ptr], #32]\n" - "movi v23.4s, #0x0\n" - "ldr q3, [%[a_ptr], #48]\n" - "movi v24.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #64]") - "movi v25.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #64]") - "movi v26.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #128]") - "movi v27.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #128]") - "movi v28.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #192]") - "movi v29.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #192]") - "movi v30.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #256]") - "movi v31.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #256]") - - // Loop structure optimized for A57 (after r0). - - // Unavoidably, the multiply will "dribble" if - // dual issued with an add. - - // Minimize the effect of this by making sure - // there are 2 adds to run under the dribbled - // multiply. - - // Pipeline in blocks of 8 multiplies - combine - // this iteration's multiplies with adds from - // the previous iteration. - - // So the first block doesn't have any adds to - // do - but because all the adds are at the - // start of the block it's only the first couple - // of multiplies that need to be pulled out. - - // Start of unroll 0 (first iteration) - "smull v12.8h, v0.8b, %[b0].8b\n" - "smull v13.8h, v0.8b, %[b1].8b\n" - - // Skip loop if we are doing zero iterations of it. 
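The scheduling comment above revolves around one three-instruction chain: smull forms eight halfword products from the low halves of the byte operands, smlal2 accumulates eight more from the high halves into the same lanes, and sadalp then pairwise-adds those halfwords into the 32-bit accumulators, four byte products per accumulator lane per step. A minimal intrinsics sketch of one such step (the function name is ours):

#include <arm_neon.h>

// One accumulation step of the smull/smlal2/sadalp pattern above.
int32x4_t mac16_s8(int32x4_t acc, int8x16_t a, int8x16_t b)
{
    int16x8_t prod = vmull_s8(vget_low_s8(a), vget_low_s8(b)); // smull
    prod = vmlal_high_s8(prod, a, b);                          // smlal2
    return vpadalq_s16(acc, prod);                             // sadalp
}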
- "cbz %w[k], 4f\n" - - // Unroll 0 continuation (branch target) - "1:\n" - "smull v14.8h, v0.8b, %[b2].8b\n" - "subs %w[k], %w[k], #1\n" - "smull v15.8h, v0.8b, %[b3].8b\n" - "ldr %q[b0a], [%[b_ptr], #64]\n" - "smlal2 v12.8h, v0.16b, %[b0].16b\n" - "smlal2 v13.8h, v0.16b, %[b1].16b\n" - "ldr %q[b1a], [%[b_ptr], #80]\n" - "smlal2 v14.8h, v0.16b, %[b2].16b\n" - "smlal2 v15.8h, v0.16b, %[b3].16b\n" - "ldr q0, [%[a_ptr], #64]\n" - - "sadalp v16.4s, v12.8h\n" - "smull v12.8h, v1.8b, %[b0].8b\n" - "sadalp v17.4s, v13.8h\n" - "sadalp v18.4s, v14.8h\n" - "smull v13.8h, v1.8b, %[b1].8b\n" - "sadalp v19.4s, v15.8h\n" - "smull v14.8h, v1.8b, %[b2].8b\n" - "ldr %q[b2a], [%[b_ptr], #96]\n" - "smull v15.8h, v1.8b, %[b3].8b\n" - "smlal2 v12.8h, v1.16b, %[b0].16b\n" - "ldr %q[b3a], [%[b_ptr], #112]\n" - "smlal2 v13.8h, v1.16b, %[b1].16b\n" - "add %[b_ptr], %[b_ptr], #128\n" - "smlal2 v14.8h, v1.16b, %[b2].16b\n" - "smlal2 v15.8h, v1.16b, %[b3].16b\n" - "ldr q1, [%[a_ptr], #80]\n" - - "sadalp v20.4s, v12.8h\n" - "smull v12.8h, v2.8b, %[b0].8b\n" - "sadalp v21.4s, v13.8h\n" - "sadalp v22.4s, v14.8h\n" - "smull v13.8h, v2.8b, %[b1].8b\n" - "sadalp v23.4s, v15.8h\n" - "smull v14.8h, v2.8b, %[b2].8b\n" - "smull v15.8h, v2.8b, %[b3].8b\n" - "smlal2 v12.8h, v2.16b, %[b0].16b\n" - ASM_PREFETCH("[%[b_ptr], #192]") - "smlal2 v13.8h, v2.16b, %[b1].16b\n" - "smlal2 v14.8h, v2.16b, %[b2].16b\n" - ASM_PREFETCH("[%[a_ptr], #320]") - "smlal2 v15.8h, v2.16b, %[b3].16b\n" - "ldr q2, [%[a_ptr], #96]\n" - - "sadalp v24.4s, v12.8h\n" - "smull v12.8h, v3.8b, %[b0].8b\n" - "sadalp v25.4s, v13.8h\n" - "sadalp v26.4s, v14.8h\n" - "smull v13.8h, v3.8b, %[b1].8b\n" - "sadalp v27.4s, v15.8h\n" - "smull v14.8h, v3.8b, %[b2].8b\n" - "smull v15.8h, v3.8b, %[b3].8b\n" - "smlal2 v12.8h, v3.16b, %[b0].16b\n" - "ldr %q[b0], [%[b_ptr], #0]\n" - "smlal2 v13.8h, v3.16b, %[b1].16b\n" - "smlal2 v14.8h, v3.16b, %[b2].16b\n" - "smlal2 v15.8h, v3.16b, %[b3].16b\n" - "ldr q3, [%[a_ptr], #112]\n" - - // Unroll 1 - "sadalp v28.4s, v12.8h\n" - "smull v12.8h, v0.8b, %[b0a].8b\n" - "sadalp v29.4s, v13.8h\n" - "sadalp v30.4s, v14.8h\n" - "smull v13.8h, v0.8b, %[b1a].8b\n" - "sadalp v31.4s, v15.8h\n" - "smull v14.8h, v0.8b, %[b2a].8b\n" - "smull v15.8h, v0.8b, %[b3a].8b\n" - "ldr %q[b1], [%[b_ptr], #16]\n" - "smlal2 v12.8h, v0.16b, %[b0a].16b\n" - "smlal2 v13.8h, v0.16b, %[b1a].16b\n" - "ldr %q[b2], [%[b_ptr], #32]\n" - "smlal2 v14.8h, v0.16b, %[b2a].16b\n" - "smlal2 v15.8h, v0.16b, %[b3a].16b\n" - "ldr q0, [%[a_ptr], #128]\n" - - "sadalp v16.4s, v12.8h\n" - "smull v12.8h, v1.8b, %[b0a].8b\n" - "sadalp v17.4s, v13.8h\n" - "sadalp v18.4s, v14.8h\n" - "smull v13.8h, v1.8b, %[b1a].8b\n" - "sadalp v19.4s, v15.8h\n" - "add %[a_ptr], %[a_ptr], #128\n" - "smull v14.8h, v1.8b, %[b2a].8b\n" - "smull v15.8h, v1.8b, %[b3a].8b\n" - "ldr %q[b3], [%[b_ptr], #48]\n" - "smlal2 v12.8h, v1.16b, %[b0a].16b\n" - "smlal2 v13.8h, v1.16b, %[b1a].16b\n" - "smlal2 v14.8h, v1.16b, %[b2a].16b\n" - "smlal2 v15.8h, v1.16b, %[b3a].16b\n" - "ldr q1, [%[a_ptr], #16]\n" - - "sadalp v20.4s, v12.8h\n" - "smull v12.8h, v2.8b, %[b0a].8b\n" - "sadalp v21.4s, v13.8h\n" - "sadalp v22.4s, v14.8h\n" - "smull v13.8h, v2.8b, %[b1a].8b\n" - "sadalp v23.4s, v15.8h\n" - "smull v14.8h, v2.8b, %[b2a].8b\n" - "smull v15.8h, v2.8b, %[b3a].8b\n" - "smlal2 v12.8h, v2.16b, %[b0a].16b\n" - ASM_PREFETCH("[%[b_ptr], #256]") - "smlal2 v13.8h, v2.16b, %[b1a].16b\n" - "smlal2 v14.8h, v2.16b, %[b2a].16b\n" - ASM_PREFETCH("[%[a_ptr], #256]") - "smlal2 v15.8h, v2.16b, %[b3a].16b\n" - "ldr q2, [%[a_ptr], #32]\n" - - 
"sadalp v24.4s, v12.8h\n" - "smull v12.8h, v3.8b, %[b0a].8b\n" - "sadalp v25.4s, v13.8h\n" - "sadalp v26.4s, v14.8h\n" - "smull v13.8h, v3.8b, %[b1a].8b\n" - "sadalp v27.4s, v15.8h\n" - "smull v14.8h, v3.8b, %[b2a].8b\n" - "smull v15.8h, v3.8b, %[b3a].8b\n" - "smlal2 v12.8h, v3.16b, %[b0a].16b\n" - "smlal2 v13.8h, v3.16b, %[b1a].16b\n" - "smlal2 v14.8h, v3.16b, %[b2a].16b\n" - "smlal2 v15.8h, v3.16b, %[b3a].16b\n" - "ldr q3, [%[a_ptr], #48]\n" - - // Start of unroll 0 for next iteration. - "sadalp v28.4s, v12.8h\n" - "smull v12.8h, v0.8b, %[b0].8b\n" - "sadalp v29.4s, v13.8h\n" - "sadalp v30.4s, v14.8h\n" - "smull v13.8h, v0.8b, %[b1].8b\n" - "sadalp v31.4s, v15.8h\n" - "bne 1b\n" - - // Target to use when K=1 or 2 (i.e. zero iterations of main loop) - "4:\n" - - // Branch to alternative tail for odd K - "cbnz %w[oddk], 2f\n" - - // Detached final iteration (even K) - "smull v14.8h, v0.8b, %[b2].8b\n" - "smull v15.8h, v0.8b, %[b3].8b\n" - "ldr %q[b0a], [%[b_ptr], #64]\n" - "smlal2 v12.8h, v0.16b, %[b0].16b\n" - "smlal2 v13.8h, v0.16b, %[b1].16b\n" - "ldr %q[b1a], [%[b_ptr], #80]\n" - "smlal2 v14.8h, v0.16b, %[b2].16b\n" - "smlal2 v15.8h, v0.16b, %[b3].16b\n" - "ldr q0, [%[a_ptr], #64]\n" - - "sadalp v16.4s, v12.8h\n" - "smull v12.8h, v1.8b, %[b0].8b\n" - "sadalp v17.4s, v13.8h\n" - "sadalp v18.4s, v14.8h\n" - "smull v13.8h, v1.8b, %[b1].8b\n" - "sadalp v19.4s, v15.8h\n" - "smull v14.8h, v1.8b, %[b2].8b\n" - "ldr %q[b2a], [%[b_ptr], #96]\n" - "smull v15.8h, v1.8b, %[b3].8b\n" - "smlal2 v12.8h, v1.16b, %[b0].16b\n" - "ldr %q[b3a], [%[b_ptr], #112]\n" - "smlal2 v13.8h, v1.16b, %[b1].16b\n" - "add %[b_ptr], %[b_ptr], #128\n" - "smlal2 v14.8h, v1.16b, %[b2].16b\n" - "smlal2 v15.8h, v1.16b, %[b3].16b\n" - "ldr q1, [%[a_ptr], #80]\n" - - "sadalp v20.4s, v12.8h\n" - "smull v12.8h, v2.8b, %[b0].8b\n" - "sadalp v21.4s, v13.8h\n" - "sadalp v22.4s, v14.8h\n" - "smull v13.8h, v2.8b, %[b1].8b\n" - "sadalp v23.4s, v15.8h\n" - "smull v14.8h, v2.8b, %[b2].8b\n" - "smull v15.8h, v2.8b, %[b3].8b\n" - "smlal2 v12.8h, v2.16b, %[b0].16b\n" - "smlal2 v13.8h, v2.16b, %[b1].16b\n" - "smlal2 v14.8h, v2.16b, %[b2].16b\n" - "smlal2 v15.8h, v2.16b, %[b3].16b\n" - "ldr q2, [%[a_ptr], #96]\n" - - "sadalp v24.4s, v12.8h\n" - "smull v12.8h, v3.8b, %[b0].8b\n" - "sadalp v25.4s, v13.8h\n" - "sadalp v26.4s, v14.8h\n" - "smull v13.8h, v3.8b, %[b1].8b\n" - "sadalp v27.4s, v15.8h\n" - "smull v14.8h, v3.8b, %[b2].8b\n" - "smull v15.8h, v3.8b, %[b3].8b\n" - "smlal2 v12.8h, v3.16b, %[b0].16b\n" - "smlal2 v13.8h, v3.16b, %[b1].16b\n" - "smlal2 v14.8h, v3.16b, %[b2].16b\n" - "smlal2 v15.8h, v3.16b, %[b3].16b\n" - "ldr q3, [%[a_ptr], #112]\n" - - // Unroll 1 - "sadalp v28.4s, v12.8h\n" - "smull v12.8h, v0.8b, %[b0a].8b\n" - "sadalp v29.4s, v13.8h\n" - "sadalp v30.4s, v14.8h\n" - "smull v13.8h, v0.8b, %[b1a].8b\n" - "sadalp v31.4s, v15.8h\n" - "smull v14.8h, v0.8b, %[b2a].8b\n" - "add %[a_ptr], %[a_ptr], #128\n" - "smull v15.8h, v0.8b, %[b3a].8b\n" - "smlal2 v12.8h, v0.16b, %[b0a].16b\n" - "smlal2 v13.8h, v0.16b, %[b1a].16b\n" - "smlal2 v14.8h, v0.16b, %[b2a].16b\n" - "smlal2 v15.8h, v0.16b, %[b3a].16b\n" - - "sadalp v16.4s, v12.8h\n" - "smull v12.8h, v1.8b, %[b0a].8b\n" - "sadalp v17.4s, v13.8h\n" - "sadalp v18.4s, v14.8h\n" - "smull v13.8h, v1.8b, %[b1a].8b\n" - "sadalp v19.4s, v15.8h\n" - "smull v14.8h, v1.8b, %[b2a].8b\n" - "smull v15.8h, v1.8b, %[b3a].8b\n" - "smlal2 v12.8h, v1.16b, %[b0a].16b\n" - "addp v16.4s, v16.4s, v17.4s\n" - "smlal2 v13.8h, v1.16b, %[b1a].16b\n" - "addp v17.4s, v18.4s, v19.4s\n" - "smlal2 v14.8h, v1.16b, 
%[b2a].16b\n" - "smlal2 v15.8h, v1.16b, %[b3a].16b\n" - - "sadalp v20.4s, v12.8h\n" - "smull v12.8h, v2.8b, %[b0a].8b\n" - "sadalp v21.4s, v13.8h\n" - "sadalp v22.4s, v14.8h\n" - "smull v13.8h, v2.8b, %[b1a].8b\n" - "sadalp v23.4s, v15.8h\n" - "addp v16.4s, v16.4s, v17.4s\n" - "smull v14.8h, v2.8b, %[b2a].8b\n" - "addp v18.4s, v20.4s, v21.4s\n" - "addp v19.4s, v22.4s, v23.4s\n" - "smull v15.8h, v2.8b, %[b3a].8b\n" - "smlal2 v12.8h, v2.16b, %[b0a].16b\n" - "str q16, [%[c_ptr]]\n" - "smlal2 v13.8h, v2.16b, %[b1a].16b\n" - "smlal2 v14.8h, v2.16b, %[b2a].16b\n" - "smlal2 v15.8h, v2.16b, %[b3a].16b\n" - - "sadalp v24.4s, v12.8h\n" - "smull v12.8h, v3.8b, %[b0a].8b\n" - "sadalp v25.4s, v13.8h\n" - "sadalp v26.4s, v14.8h\n" - "smull v13.8h, v3.8b, %[b1a].8b\n" - "sadalp v27.4s, v15.8h\n" - "addp v17.4s, v18.4s, v19.4s\n" - "smull v14.8h, v3.8b, %[b2a].8b\n" - "addp v20.4s, v24.4s, v25.4s\n" - "addp v21.4s, v26.4s, v27.4s\n" - "smull v15.8h, v3.8b, %[b3a].8b\n" - "smlal2 v12.8h, v3.16b, %[b0a].16b\n" - "str q17, [%[c_ptr], #16]\n" - "smlal2 v13.8h, v3.16b, %[b1a].16b\n" - "smlal2 v14.8h, v3.16b, %[b2a].16b\n" - "addp v18.4s, v20.4s, v21.4s\n" - "smlal2 v15.8h, v3.16b, %[b3a].16b\n" - "b 3f\n" - - // Detached final iteration (odd K) - "2:\n" - "smull v14.8h, v0.8b, %[b2].8b\n" - "add %[a_ptr], %[a_ptr], #64\n" - "smull v15.8h, v0.8b, %[b3].8b\n" - "add %[b_ptr], %[b_ptr], #64\n" - "smlal2 v12.8h, v0.16b, %[b0].16b\n" - "smlal2 v13.8h, v0.16b, %[b1].16b\n" - "smlal2 v14.8h, v0.16b, %[b2].16b\n" - "smlal2 v15.8h, v0.16b, %[b3].16b\n" - - "sadalp v16.4s, v12.8h\n" - "smull v12.8h, v1.8b, %[b0].8b\n" - "sadalp v17.4s, v13.8h\n" - "sadalp v18.4s, v14.8h\n" - "smull v13.8h, v1.8b, %[b1].8b\n" - "sadalp v19.4s, v15.8h\n" - "smull v14.8h, v1.8b, %[b2].8b\n" - "smull v15.8h, v1.8b, %[b3].8b\n" - "smlal2 v12.8h, v1.16b, %[b0].16b\n" - "addp v16.4s, v16.4s, v17.4s\n" - "smlal2 v13.8h, v1.16b, %[b1].16b\n" - "addp v17.4s, v18.4s, v19.4s\n" - "smlal2 v14.8h, v1.16b, %[b2].16b\n" - "smlal2 v15.8h, v1.16b, %[b3].16b\n" - - "sadalp v20.4s, v12.8h\n" - "smull v12.8h, v2.8b, %[b0].8b\n" - "sadalp v21.4s, v13.8h\n" - "sadalp v22.4s, v14.8h\n" - "smull v13.8h, v2.8b, %[b1].8b\n" - "sadalp v23.4s, v15.8h\n" - "addp v16.4s, v16.4s, v17.4s\n" - "smull v14.8h, v2.8b, %[b2].8b\n" - "addp v18.4s, v20.4s, v21.4s\n" - "addp v19.4s, v22.4s, v23.4s\n" - "smull v15.8h, v2.8b, %[b3].8b\n" - "smlal2 v12.8h, v2.16b, %[b0].16b\n" - "str q16, [%[c_ptr]]\n" - "smlal2 v13.8h, v2.16b, %[b1].16b\n" - "smlal2 v14.8h, v2.16b, %[b2].16b\n" - "smlal2 v15.8h, v2.16b, %[b3].16b\n" - - "sadalp v24.4s, v12.8h\n" - "smull v12.8h, v3.8b, %[b0].8b\n" - "sadalp v25.4s, v13.8h\n" - "sadalp v26.4s, v14.8h\n" - "smull v13.8h, v3.8b, %[b1].8b\n" - "sadalp v27.4s, v15.8h\n" - "addp v17.4s, v18.4s, v19.4s\n" - "smull v14.8h, v3.8b, %[b2].8b\n" - "addp v20.4s, v24.4s, v25.4s\n" - "addp v21.4s, v26.4s, v27.4s\n" - "smull v15.8h, v3.8b, %[b3].8b\n" - "smlal2 v12.8h, v3.16b, %[b0].16b\n" - "str q17, [%[c_ptr], #16]\n" - "smlal2 v13.8h, v3.16b, %[b1].16b\n" - "smlal2 v14.8h, v3.16b, %[b2].16b\n" - "addp v18.4s, v20.4s, v21.4s\n" - "smlal2 v15.8h, v3.16b, %[b3].16b\n" - - "3:\n" - - // Final additions - "sadalp v28.4s, v12.8h\n" - "str q18, [%[c_ptr], #32]\n" - "sadalp v29.4s, v13.8h\n" - "sadalp v30.4s, v14.8h\n" - "sadalp v31.4s, v15.8h\n" - - // Horizontal reduction, phase 1 - "addp v22.4s, v28.4s, v29.4s\n" - "addp v23.4s, v30.4s, v31.4s\n" - - // Horizontal reduction, phase 2 - "addp v19.4s, v22.4s, v23.4s\n" - "str q19, [%[c_ptr], #48]\n" - "add %[c_ptr], 
%[c_ptr], #64\n" - - : - [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), - [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [b3] "+w" (b3), - [b0a] "+w" (b0a), [b1a] "+w" (b1a), [b2a] "+w" (b2a), [b3a] "+w" (b3a), - [k] "+r" (k) - : [oddk] "r" (oddk) - : "x20", "x21", "v0","v1","v2","v3","v12","v13","v14","v15","v16","v17","v18","v19", - "v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31", "cc"); - } - } -} - -#endif // __aarch64__ diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8.hpp deleted file mode 100644 index 7eb8b2dacf..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8.hpp +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __aarch64__ - -// Actual kernel implementations -#include "a64_gemm_u16_12x8/generic.hpp" - -// 12x8 u16 GEMM "strategy" class. - // - // This describes the characteristics of a family of kernels, in terms of - // the required interleave properties and the output block size. - // - // All kernels in the family must share these characteristics. The actual - // kernel to be used can be chosen at runtime, based on the CPU_type - // structure. -class gemm_u16_12x8 { -public: - typedef uint16_t operand_type; - typedef uint32_t result_type; - - typedef void (*kern_type)(const uint16_t *, const uint16_t *, uint32_t *, int, int, int); - - /* Describes the data layout for A input */ - static const int A_interleave = 8; - static const int A_block = 1; - static const int A_transpose = 0; - - /* Same for B input */ - static const int B_interleave = 12; - static const int B_block = 1; - static const int B_transpose = 1; - - /* Kernel blocking parameters */ - static const int out_width = 12; - static const int out_height = 8; - static const int k_unroll = 1; - - kern_type kernel = nullptr; - - gemm_u16_12x8(const CPUInfo *ci) { - kernel = a64_gemm_u16_asimd_12x8; - } -}; - -#endif // __aarch64__ diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8/generic.hpp deleted file mode 100644 index b3f310ce62..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8/generic.hpp +++ /dev/null @@ -1,314 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited.
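Before the kernel itself, note how the 12x8 blocking is sized to the AArch64 register file: a 12 x 8 output tile is 96 32-bit results, i.e. 96 / 4 = 24 quad registers of accumulators (v5 through v28 in the kernel below), leaving v0 to v4 for the A and B operands, so nothing has to spill inside the loop.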
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once -#include <arm_neon.h> - -inline void a64_gemm_u16_asimd_12x8(const uint16_t *Apanel, const uint16_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) -{ - const uint16_t *a_ptr = Apanel; - uint32_t *c_ptr = Cpanel; - - for (int yb = 0; yb < ablocks; yb++) - { - const uint16_t *a_ptr0 = a_ptr; - const uint16_t *b_ptr = Bpanel; - - for (int xb = 0; xb < bblocks; xb++) - { - a_ptr = a_ptr0; - const bool odd_k = K & 0x1; - int k = (K+1)/2 - 1; - - register uint16x8_t aa asm("v0"); - register uint16x8_t ab asm("v1"); - register uint16x8_t b0 asm("v2"); - register uint16x8_t b1 asm("v3"); - register uint16x8_t b2 asm("v4"); - - __asm __volatile ( - "ldr %d[aa], [%x[a_ptr]]\n" // Load A[A].lower - "movi v5.4s, #0\n" - "ldr x20, [%x[a_ptr], #0x08]\n" // Load A[A].upper - "movi v6.4s, #0\n" - "ldr %d[b0], [%x[b_ptr]]\n" // Load B[0].lower - "ins %[aa].d[1], x20\n" // Merge A[A].lower and upper - "movi v7.4s, #0\n" - ASM_PREFETCH("[%[a_ptr], #64]") - "movi v8.4s, #0\n" - "ldr x20, [%x[b_ptr], #0x08]\n" // Load B[0].upper - "movi v9.4s, #0\n" - ASM_PREFETCH("[%[b_ptr], #64]") - "movi v10.4s, #0\n" - "ldr %d[b1], [%x[b_ptr], #0x10]\n" // Load B[1].lower - "ins %[b0].d[1], x20\n" // Merge B[0].lower and upper - "movi v11.4s, #0\n" - ASM_PREFETCH("[%[a_ptr], #96]") - "movi v12.4s, #0\n" - "movi v13.4s, #0\n" - ASM_PREFETCH("[%[b_ptr], #96]") - "movi v14.4s, #0\n" - "movi v15.4s, #0\n" - ASM_PREFETCH("[%[a_ptr], #128]") - "movi v16.4s, #0\n" - "movi v17.4s, #0\n" - ASM_PREFETCH("[%[b_ptr], #128]") - "movi v18.4s, #0\n" - "movi v19.4s, #0\n" - ASM_PREFETCH("[%[a_ptr], #160]") - "movi v20.4s, #0\n" - "movi v21.4s, #0\n" - ASM_PREFETCH("[%[b_ptr], #160]") - "movi v22.4s, #0\n" - "movi v23.4s, #0\n" - ASM_PREFETCH("[%[a_ptr], #192]") - "movi v24.4s, #0\n" - "add %x[a_ptr], %x[a_ptr], #0x10\n" - "movi v25.4s, #0\n" - ASM_PREFETCH("[%[b_ptr], #192]") - "movi v26.4s, #0\n" - "add %x[b_ptr], %x[b_ptr], #0x18\n" - "movi v27.4s, #0\n" - "movi v28.4s, #0\n" - - "cbz %x[k], 2f\n" // Skip the loop if doing zero iterations. 
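Note the load pattern the prologue above establishes: each 128-bit operand is built from an "ldr d" (low half), an "ldr x20" (high half into a general-purpose register) and an "ins" that merges the two, rather than a single "ldr q". The likely motivation, that two 64-bit loads dual-issue with arithmetic more readily on in-order cores, is our reading; the patch itself does not state it. The idiom in isolation, as a hedged sketch (function name ours):

#include <arm_neon.h>
#include <cstdint>

uint16x8_t load_q_split(const uint16_t *p)
{
    register uint16x8_t v asm("v0");
    __asm__ volatile(
        "ldr %d[v], [%[p]]\n"    // low 64 bits into v0.d[0]
        "ldr x20, [%[p], #8]\n"  // high 64 bits into a GPR
        "ins %[v].d[1], x20\n"   // merge into the full q register
        : [v] "=w"(v)
        : [p] "r"(p)
        : "x20", "memory");
    return v;
}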
- - "1:\n" // Main loop - // First unroll - "umlal v5.4s, %[b0].4h, %[aa].h[0]\n" - "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper - "umlal v6.4s, %[b0].4h, %[aa].h[1]\n" - "umlal v7.4s, %[b0].4h, %[aa].h[2]\n" - "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower - "ins %[b1].d[1], x20\n" // Merge B[1].lower and .upper - "umlal v8.4s, %[b0].4h, %[aa].h[3]\n" - "umlal v9.4s, %[b0].4h, %[aa].h[4]\n" - "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper - "umlal v10.4s, %[b0].4h, %[aa].h[5]\n" - "umlal v11.4s, %[b0].4h, %[aa].h[6]\n" - "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower - "ins %[ab].d[1], x20\n" // Merge A[B].lower and .upper - "umlal v12.4s, %[b0].4h, %[aa].h[7]\n" - "umlal2 v13.4s, %[b0].8h, %[aa].h[0]\n" - "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper - "umlal2 v14.4s, %[b0].8h, %[aa].h[1]\n" - "umlal2 v15.4s, %[b0].8h, %[aa].h[2]\n" - "umlal2 v16.4s, %[b0].8h, %[aa].h[3]\n" - "umlal2 v17.4s, %[b0].8h, %[aa].h[4]\n" - "umlal2 v18.4s, %[b0].8h, %[aa].h[5]\n" - "umlal2 v19.4s, %[b0].8h, %[aa].h[6]\n" - "umlal2 v20.4s, %[b0].8h, %[aa].h[7]\n" - "ldr %d[b0], [%x[b_ptr], #0x18]\n" // Load B[0].lower - "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper - "umlal v21.4s, %[b1].4h, %[aa].h[0]\n" - "umlal v22.4s, %[b1].4h, %[aa].h[1]\n" - "ldr x20, [%x[b_ptr], #0x20]\n" // Load B[0].upper - "umlal v23.4s, %[b1].4h, %[aa].h[2]\n" - "umlal v24.4s, %[b1].4h, %[aa].h[3]\n" - "umlal v25.4s, %[b1].4h, %[aa].h[4]\n" - "umlal v26.4s, %[b1].4h, %[aa].h[5]\n" - "umlal v27.4s, %[b1].4h, %[aa].h[6]\n" - "umlal v28.4s, %[b1].4h, %[aa].h[7]\n" - - // Second unroll - "umlal2 v5.4s, %[b1].8h, %[ab].h[0]\n" - "ldr %d[aa], [%x[a_ptr], #0x10]\n" // Load A[A].lower - "ins %[b0].d[1], x20\n" // Merge B[0].lower and .upper - "umlal2 v6.4s, %[b1].8h, %[ab].h[1]\n" - "umlal2 v7.4s, %[b1].8h, %[ab].h[2]\n" - "ldr x20, [%x[a_ptr], #0x18]\n" // Load A[A].upper - "umlal2 v8.4s, %[b1].8h, %[ab].h[3]\n" - "umlal2 v9.4s, %[b1].8h, %[ab].h[4]\n" - "umlal2 v10.4s, %[b1].8h, %[ab].h[5]\n" - "umlal2 v11.4s, %[b1].8h, %[ab].h[6]\n" - "add %x[a_ptr], %x[a_ptr], #0x20\n" - "umlal2 v12.4s, %[b1].8h, %[ab].h[7]\n" - "umlal v13.4s, %[b2].4h, %[ab].h[0]\n" - ASM_PREFETCH("[%[b_ptr], #320]") - "umlal v14.4s, %[b2].4h, %[ab].h[1]\n" - "umlal v15.4s, %[b2].4h, %[ab].h[2]\n" - ASM_PREFETCH("[%[a_ptr], #320]") - "umlal v16.4s, %[b2].4h, %[ab].h[3]\n" - "umlal v17.4s, %[b2].4h, %[ab].h[4]\n" - ASM_PREFETCH("[%[b_ptr], #448]") - "umlal v18.4s, %[b2].4h, %[ab].h[5]\n" - "umlal v19.4s, %[b2].4h, %[ab].h[6]\n" - "umlal v20.4s, %[b2].4h, %[ab].h[7]\n" - "umlal2 v21.4s, %[b2].8h, %[ab].h[0]\n" - "umlal2 v22.4s, %[b2].8h, %[ab].h[1]\n" - "subs %x[k], %x[k], #0x1\n" - "umlal2 v23.4s, %[b2].8h, %[ab].h[2]\n" - "umlal2 v24.4s, %[b2].8h, %[ab].h[3]\n" - "ldr %d[b1], [%x[b_ptr], #0x28]\n" // Load B[1].lower - "ins %[aa].d[1], x20\n" // Merge A[A].lower and .upper - "umlal2 v25.4s, %[b2].8h, %[ab].h[4]\n" - "umlal2 v26.4s, %[b2].8h, %[ab].h[5]\n" - "add %x[b_ptr], %x[b_ptr], #0x30\n" - "umlal2 v27.4s, %[b2].8h, %[ab].h[6]\n" - "umlal2 v28.4s, %[b2].8h, %[ab].h[7]\n" - "bne 1b\n" - - "2:\n" // Even tail - "cbnz %x[odd_k], 3f\n" - - "umlal v5.4s, %[b0].4h, %[aa].h[0]\n" - "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper - "umlal v6.4s, %[b0].4h, %[aa].h[1]\n" - "umlal v7.4s, %[b0].4h, %[aa].h[2]\n" - "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower - "ins %[b1].d[1], x20\n" // Merge B[1].lower and .upper - "umlal v8.4s, %[b0].4h, %[aa].h[3]\n" - "umlal v9.4s, %[b0].4h, %[aa].h[4]\n" - "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper - "umlal v10.4s, 
%[b0].4h, %[aa].h[5]\n" - "umlal v11.4s, %[b0].4h, %[aa].h[6]\n" - "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower - "ins %[ab].d[1], x20\n" // Merge A[B].lower and .upper - "umlal v12.4s, %[b0].4h, %[aa].h[7]\n" - "umlal2 v13.4s, %[b0].8h, %[aa].h[0]\n" - "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper - "umlal2 v14.4s, %[b0].8h, %[aa].h[1]\n" - "umlal2 v15.4s, %[b0].8h, %[aa].h[2]\n" - "umlal2 v16.4s, %[b0].8h, %[aa].h[3]\n" - "add %[a_ptr], %[a_ptr], #0x10\n" - "umlal2 v17.4s, %[b0].8h, %[aa].h[4]\n" - "add %[b_ptr], %[b_ptr], #0x18\n" - "umlal2 v18.4s, %[b0].8h, %[aa].h[5]\n" - "umlal2 v19.4s, %[b0].8h, %[aa].h[6]\n" - "umlal2 v20.4s, %[b0].8h, %[aa].h[7]\n" - "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper - "umlal v21.4s, %[b1].4h, %[aa].h[0]\n" - "umlal v22.4s, %[b1].4h, %[aa].h[1]\n" - "umlal v23.4s, %[b1].4h, %[aa].h[2]\n" - "umlal v24.4s, %[b1].4h, %[aa].h[3]\n" - "umlal v25.4s, %[b1].4h, %[aa].h[4]\n" - "umlal v26.4s, %[b1].4h, %[aa].h[5]\n" - "umlal v27.4s, %[b1].4h, %[aa].h[6]\n" - "umlal v28.4s, %[b1].4h, %[aa].h[7]\n" - - "umlal2 v5.4s, %[b1].8h, %[ab].h[0]\n" - "umlal v13.4s, %[b2].4h, %[ab].h[0]\n" - "umlal2 v21.4s, %[b2].8h, %[ab].h[0]\n" - "umlal2 v6.4s, %[b1].8h, %[ab].h[1]\n" - "umlal v14.4s, %[b2].4h, %[ab].h[1]\n" - "str q5, [%x[c_ptr]]\n" - "umlal2 v22.4s, %[b2].8h, %[ab].h[1]\n" - "str q13, [%x[c_ptr], #0x10]\n" - "umlal2 v7.4s, %[b1].8h, %[ab].h[2]\n" - "str q21, [%x[c_ptr], #0x20]\n" - "umlal v15.4s, %[b2].4h, %[ab].h[2]\n" - "str q6, [%x[c_ptr], #0x30]\n" - "umlal2 v23.4s, %[b2].8h, %[ab].h[2]\n" - "str q14, [%x[c_ptr], #0x40]\n" - "umlal2 v8.4s, %[b1].8h, %[ab].h[3]\n" - "str q22, [%x[c_ptr], #0x50]\n" - "umlal v16.4s, %[b2].4h, %[ab].h[3]\n" - "str q7, [%x[c_ptr], #0x60]\n" - "umlal2 v24.4s, %[b2].8h, %[ab].h[3]\n" - "str q15, [%x[c_ptr], #0x70]\n" - "umlal2 v9.4s, %[b1].8h, %[ab].h[4]\n" - "str q23, [%x[c_ptr], #0x80]\n" - "umlal v17.4s, %[b2].4h, %[ab].h[4]\n" - "str q8, [%x[c_ptr], #0x90]\n" - "umlal2 v25.4s, %[b2].8h, %[ab].h[4]\n" - "str q16, [%x[c_ptr], #0xa0]\n" - "umlal2 v10.4s, %[b1].8h, %[ab].h[5]\n" - "str q24, [%x[c_ptr], #0xb0]\n" - "umlal v18.4s, %[b2].4h, %[ab].h[5]\n" - "str q9, [%x[c_ptr], #0xc0]\n" - "umlal2 v26.4s, %[b2].8h, %[ab].h[5]\n" - "str q17, [%x[c_ptr], #0xd0]\n" - "umlal2 v11.4s, %[b1].8h, %[ab].h[6]\n" - "str q25, [%x[c_ptr], #0xe0]\n" - "umlal v19.4s, %[b2].4h, %[ab].h[6]\n" - "str q10, [%x[c_ptr], #0xf0]\n" - "umlal2 v27.4s, %[b2].8h, %[ab].h[6]\n" - "str q18, [%x[c_ptr], #0x100]\n" - "umlal2 v12.4s, %[b1].8h, %[ab].h[7]\n" - "str q26, [%x[c_ptr], #0x110]\n" - "umlal v20.4s, %[b2].4h, %[ab].h[7]\n" - "str q11, [%x[c_ptr], #0x120]\n" - "umlal2 v28.4s, %[b2].8h, %[ab].h[7]\n" - "str q19, [%x[c_ptr], #0x130]\n" - "b 4f\n" // Complete write out - - "3:\n" // Odd tail - "umlal v5.4s, %[b0].4h, %[aa].h[0]\n" - "umlal2 v13.4s, %[b0].8h, %[aa].h[0]\n" - "umlal v21.4s, %[b1].4h, %[aa].h[0]\n" - "umlal v6.4s, %[b0].4h, %[aa].h[1]\n" - "umlal2 v14.4s, %[b0].8h, %[aa].h[1]\n" - "umlal v22.4s, %[b1].4h, %[aa].h[1]\n" - "str q5, [%x[c_ptr]]\n" - "umlal v7.4s, %[b0].4h, %[aa].h[2]\n" - "str q13, [%x[c_ptr], #0x10]\n" - "umlal2 v15.4s, %[b0].8h, %[aa].h[2]\n" - "str q21, [%x[c_ptr], #0x20]\n" - "umlal v23.4s, %[b1].4h, %[aa].h[2]\n" - "str q6, [%x[c_ptr], #0x30]\n" - "umlal v8.4s, %[b0].4h, %[aa].h[3]\n" - "str q14, [%x[c_ptr], #0x40]\n" - "umlal2 v16.4s, %[b0].8h, %[aa].h[3]\n" - "str q22, [%x[c_ptr], #0x50]\n" - "umlal v24.4s, %[b1].4h, %[aa].h[3]\n" - "str q7, [%x[c_ptr], #0x60]\n" - "umlal v9.4s, %[b0].4h, %[aa].h[4]\n" - 
"str q15, [%x[c_ptr], #0x70]\n" - "umlal2 v17.4s, %[b0].8h, %[aa].h[4]\n" - "str q23, [%x[c_ptr], #0x80]\n" - "umlal v25.4s, %[b1].4h, %[aa].h[4]\n" - "str q8, [%x[c_ptr], #0x90]\n" - "umlal v10.4s, %[b0].4h, %[aa].h[5]\n" - "str q16, [%x[c_ptr], #0xa0]\n" - "umlal2 v18.4s, %[b0].8h, %[aa].h[5]\n" - "str q24, [%x[c_ptr], #0xb0]\n" - "umlal v26.4s, %[b1].4h, %[aa].h[5]\n" - "str q9, [%x[c_ptr], #0xc0]\n" - "umlal v11.4s, %[b0].4h, %[aa].h[6]\n" - "str q17, [%x[c_ptr], #0xd0]\n" - "umlal2 v19.4s, %[b0].8h, %[aa].h[6]\n" - "str q25, [%x[c_ptr], #0xe0]\n" - "umlal v27.4s, %[b1].4h, %[aa].h[6]\n" - "str q10, [%x[c_ptr], #0xf0]\n" - "umlal v12.4s, %[b0].4h, %[aa].h[7]\n" - "str q18, [%x[c_ptr], #0x100]\n" - "umlal2 v20.4s, %[b0].8h, %[aa].h[7]\n" - "str q26, [%x[c_ptr], #0x110]\n" - "umlal v28.4s, %[b1].4h, %[aa].h[7]\n" - "str q11, [%x[c_ptr], #0x120]\n" - - "4:\n" // End of function - "str q19, [%x[c_ptr], #0x130]\n" - "str q27, [%x[c_ptr], #0x140]\n" - "str q12, [%x[c_ptr], #0x150]\n" - "str q20, [%x[c_ptr], #0x160]\n" - "str q28, [%x[c_ptr], #0x170]\n" - "add %x[c_ptr], %x[c_ptr], #0x180\n" - : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), [k] "+r" (k), - [aa] "+w" (aa), [ab] "+w" (ab), [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2) - : [odd_k] "r" (odd_k) - : "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "cc" - ); - } - } -} diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8.hpp deleted file mode 100644 index 62cd747d7c..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8.hpp +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#pragma once - -#ifdef __aarch64__ - -// Load the actual kernel -#include "a64_gemm_u8_12x8/generic.hpp" -#include "a64_gemm_u8_12x8/a55r1.hpp" - -class gemm_u8_12x8 { -public: - typedef uint8_t operand_type; - typedef uint32_t result_type; - - typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); - - /* Describes the data layout for A input */ - static const int A_interleave = 8; - static const int A_block = 4; - static const bool A_transpose = false; - - /* Same for B input */ - static const int B_interleave = 12; - static const int B_block = 4; - static const bool B_transpose = true; - - /* Kernel blocking parameters */ - static const int out_width = 12; - static const int out_height = 8; - static const int k_unroll = 4; - - kern_type kernel = nullptr; - - gemm_u8_12x8(const CPUInfo *ci) { - kernel = a64_gemm_u8_12x8; - if (ci->CPU == CPUTarget::A55_DOT) { - kernel = a64_gemm_u8_12x8_a55r1; - } - } -}; - -#endif // __aarch64__ - diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/a55r1.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/a55r1.hpp deleted file mode 100644 index c7c2acbb49..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/a55r1.hpp +++ /dev/null @@ -1,396 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __aarch64__ - -#include <arm_neon.h> -#include "dot_toolchain_support.h" -#include <cassert> - -inline void a64_gemm_u8_12x8_a55r1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) { - assert(Apanel); - assert(Bpanel); - assert(Cpanel); - const uint8_t *a_ptr = Apanel; - uint32_t *c_ptr = Cpanel; - // We divide K by 4 because the udot instruction processes 4 elements at a time. - const int W = K/4; - // Fix up for odd lengths - set a flag if K is odd, but make - // sure we round up the iteration count. 
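The odd-length fix-up described in the comment above is the same bookkeeping every kernel in this patch uses: each main-loop trip consumes two K-steps, and a detached final iteration handles the last one (odd) or two (even) steps. A standalone check of the arithmetic:

#include <cstdio>

int main()
{
    for (int W = 1; W <= 6; W++) {
        const int oddk  = (W & 1);           // flag for the odd tail
        const int trips = ((W + 1) / 2) - 1; // main-loop iterations
        const int tail  = oddk ? 1 : 2;      // detached final iteration
        // trips * 2 + tail always re-totals W
        std::printf("W=%d trips=%d tail=%d total=%d\n",
                    W, trips, tail, trips * 2 + tail);
    }
    return 0;
}

For W of 1 or 2, trips is 0 and the "cbz" above each loop branches straight to the detached iteration, which is exactly what the "Target to use when K is 1 or 2" labels in these kernels refer to.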
- const int oddk = (W & 1); - const int init_value_k = ((W+1)/2) - 1; - for (int yb=0; yb<ablocks; yb++) { - const uint8_t *a_ptr0 = a_ptr; - const uint8_t *b_ptr = Bpanel; - for (int xb=0; xb<bblocks; xb++) { - a_ptr = a_ptr0; - int k = init_value_k; - register int32x4_t a0 asm("v0"); - register int32x4_t a1 asm("v1"); - register int32x4_t b0 asm("v2"); - register int32x4_t b1 asm("v3"); - register int32x4_t b2 asm("v4"); - register int32x4_t a0a asm("v5"); - register int32x4_t a1a asm("v6"); - __asm __volatile ( - _DECLARE_UDOT - // Initialize result registers, load initial operands, prime prefetches. - "movi v8.4s, #0x0\n" - "ldp %q[a0], %q[a1], [%[a_ptr]]\n" - "movi v9.4s, #0x0\n" - "ldp %q[b0], %q[b1], [%[b_ptr]]\n" - "movi v10.4s, #0x0\n" - "movi v11.4s, #0x0\n" - "movi v12.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #64]") - "movi v13.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #64]") - "movi v14.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #128]") - "movi v15.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #128]") - "movi v16.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #192]") - "movi v17.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #256]") - "movi v18.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #192]") - "movi v19.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #320]") - "movi v20.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #256]") - "movi v21.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #384]") - "movi v22.4s, #0x0\n" - "movi v23.4s, #0x0\n" - "movi v24.4s, #0x0\n" - "movi v25.4s, #0x0\n" - "movi v26.4s, #0x0\n" - "movi v27.4s, #0x0\n" - "movi v28.4s, #0x0\n" - "movi v29.4s, #0x0\n" - "movi v30.4s, #0x0\n" - "movi v31.4s, #0x0\n" - - // Skip loop if we are doing zero iterations of it. - "cbz %w[k], 4f\n" - - - // Loop proper - "1:\n" - "udot v8.4s , %[b0].16b, %[a0].4b[0]\n" - "ldr %d[b2], [%[b_ptr], #32]\n" - - "udot v9.4s , %[b0].16b, %[a0].4b[1]\n" - "ldr x20, [%[b_ptr], #40]\n" - "udot v10.4s, %[b0].16b, %[a0].4b[2]\n" - "udot v11.4s, %[b0].16b, %[a0].4b[3]\n" - "ldr %d[a0a], [%[a_ptr], #32]\n" - - "udot v12.4s, %[b0].16b, %[a1].4b[0]\n" - "ins %[b2].d[1], x20\n" - "udot v13.4s, %[b0].16b, %[a1].4b[1]\n" - "ldr x20, [%[a_ptr], #40]\n" - "udot v14.4s, %[b0].16b, %[a1].4b[2]\n" - "udot v15.4s, %[b0].16b, %[a1].4b[3]\n" - "ldr %d[a1a], [%[a_ptr], #48]\n" - - - "udot v16.4s, %[b1].16b, %[a0].4b[0]\n" - "ins %[a0a].d[1], x20\n" - ASM_PREFETCH("[%[a_ptr], #320]") - "udot v17.4s, %[b1].16b, %[a0].4b[1]\n" - "ldr x20, [%[a_ptr], #56]\n" - "udot v18.4s, %[b1].16b, %[a0].4b[2]\n" - "udot v19.4s, %[b1].16b, %[a0].4b[3]\n" - "ldr %d[b0], [%[b_ptr], #48]\n" - - "udot v20.4s, %[b1].16b, %[a1].4b[0]\n" - "ins %[a1a].d[1], x20\n" - ASM_PREFETCH("[%[b_ptr], #448]") - "udot v21.4s, %[b1].16b, %[a1].4b[1]\n" - "ldr x20, [%[b_ptr], #56]\n" - "udot v22.4s, %[b1].16b, %[a1].4b[2]\n" - "udot v23.4s, %[b1].16b, %[a1].4b[3]\n" - "ldr %d[b1], [%[b_ptr], #64]\n" - - "udot v24.4s, %[b2].16b, %[a0].4b[0]\n" - "ins %[b0].d[1], x20\n" - "udot v25.4s, %[b2].16b, %[a0].4b[1]\n" - "ldr x20, [%[b_ptr], #72]\n" - "udot v26.4s, %[b2].16b, %[a0].4b[2]\n" - "udot v27.4s, %[b2].16b, %[a0].4b[3]\n" - - "udot v28.4s, %[b2].16b, %[a1].4b[0]\n" - "udot v29.4s, %[b2].16b, %[a1].4b[1]\n" - "udot v30.4s, %[b2].16b, %[a1].4b[2]\n" - "udot v31.4s, %[b2].16b, %[a1].4b[3]\n" - - "ldr %d[b2], [%[b_ptr], #80]\n" - - "udot v8.4s , %[b0].16b, %[a0a].4b[0]\n" - "ins %[b1].d[1], x20\n" - "udot v9.4s , %[b0].16b, %[a0a].4b[1]\n" - "ldr x20, [%[b_ptr], #88]\n" - "udot v10.4s, %[b0].16b, %[a0a].4b[2]\n" - "udot v11.4s, %[b0].16b, %[a0a].4b[3]\n" - "ldr %d[a0], [%[a_ptr], #64]\n" - - "udot 
v12.4s, %[b0].16b, %[a1a].4b[0]\n" - "ins %[b2].d[1], x20\n" - "udot v13.4s, %[b0].16b, %[a1a].4b[1]\n" - "ldr x20, [%[a_ptr], #72]\n" - "udot v14.4s, %[b0].16b, %[a1a].4b[2]\n" - "udot v15.4s, %[b0].16b, %[a1a].4b[3]\n" - "ldr %d[a1], [%[a_ptr], #80]\n" - - "udot v16.4s, %[b1].16b, %[a0a].4b[0]\n" - "ins %[a0].d[1], x20\n" - ASM_PREFETCH("[%[b_ptr], #512]") - "udot v17.4s, %[b1].16b, %[a0a].4b[1]\n" - "ldr x20, [%[a_ptr], #88]\n" - "udot v18.4s, %[b1].16b, %[a0a].4b[2]\n" - "udot v19.4s, %[b1].16b, %[a0a].4b[3]\n" - "ldr %d[b0], [%[b_ptr], #96]\n" - - "udot v20.4s, %[b1].16b, %[a1a].4b[0]\n" - "ins %[a1].d[1], x20\n" - "udot v21.4s, %[b1].16b, %[a1a].4b[1]\n" - "ldr x20, [%[b_ptr], #104]\n" - "udot v22.4s, %[b1].16b, %[a1a].4b[2]\n" - "udot v23.4s, %[b1].16b, %[a1a].4b[3]\n" - "ldr %d[b1], [%[b_ptr], #112]\n" - - "udot v24.4s, %[b2].16b, %[a0a].4b[0]\n" - "ins %[b0].d[1], x20\n" - "udot v25.4s, %[b2].16b, %[a0a].4b[1]\n" - "ldr x20, [%[b_ptr], #120]\n" - "add %[a_ptr], %[a_ptr], #64\n" - "udot v26.4s, %[b2].16b, %[a0a].4b[2]\n" - "udot v27.4s, %[b2].16b, %[a0a].4b[3]\n" - - "udot v28.4s, %[b2].16b, %[a1a].4b[0]\n" - "udot v29.4s, %[b2].16b, %[a1a].4b[1]\n" - "add %[b_ptr], %[b_ptr], #96\n" - "udot v30.4s, %[b2].16b, %[a1a].4b[2]\n" - "subs %w[k], %w[k], #1\n" - "ins %[b1].d[1], x20\n" - "udot v31.4s, %[b2].16b, %[a1a].4b[3]\n" - "ldr %d[b2], [%[b_ptr], #32]\n" - "bne 1b\n" - - // Target to use when K is 1 or 2 (i.e. zero iterations of main loop) - "4:\n" - - // Branch to alternative tail for odd K - "cbnz %w[oddk], 2f\n" - "ldr %d[b2], [%[b_ptr], #32]\n" - - // Detached final iteration (even K) - "udot v8.4s , %[b0].16b, %[a0].4b[0]\n" - "udot v9.4s , %[b0].16b, %[a0].4b[1]\n" - "ldr x20, [%[b_ptr], #40]\n" - "udot v10.4s, %[b0].16b, %[a0].4b[2]\n" - "udot v11.4s, %[b0].16b, %[a0].4b[3]\n" - "ldr %d[a0a], [%[a_ptr], #32]\n" - - "udot v12.4s, %[b0].16b, %[a1].4b[0]\n" - "ins %[b2].d[1], x20\n" - - "udot v13.4s, %[b0].16b, %[a1].4b[1]\n" - "ldr x20, [%[a_ptr], #40]\n" - "udot v14.4s, %[b0].16b, %[a1].4b[2]\n" - "udot v15.4s, %[b0].16b, %[a1].4b[3]\n" - "ldr %d[a1a], [%[a_ptr], #48]\n" - - "udot v16.4s, %[b1].16b, %[a0].4b[0]\n" - "ins %[a0a].d[1], x20\n" - "udot v17.4s, %[b1].16b, %[a0].4b[1]\n" - "ldr x20, [%[a_ptr], #56]\n" - "udot v18.4s, %[b1].16b, %[a0].4b[2]\n" - "udot v19.4s, %[b1].16b, %[a0].4b[3]\n" - "ldr %d[b0], [%[b_ptr], #48]\n" - - "udot v20.4s, %[b1].16b, %[a1].4b[0]\n" - "ins %[a1a].d[1], x20\n" - "udot v21.4s, %[b1].16b, %[a1].4b[1]\n" - "ldr x20, [%[b_ptr], #56]\n" - "udot v22.4s, %[b1].16b, %[a1].4b[2]\n" - "udot v23.4s, %[b1].16b, %[a1].4b[3]\n" - - "udot v24.4s, %[b2].16b, %[a0].4b[0]\n" - "udot v25.4s, %[b2].16b, %[a0].4b[1]\n" - "add %[a_ptr], %[a_ptr], #64\n" - "udot v26.4s, %[b2].16b, %[a0].4b[2]\n" - "udot v27.4s, %[b2].16b, %[a0].4b[3]\n" - "ldr %d[b1], [%[b_ptr], #64]\n" - - "udot v28.4s, %[b2].16b, %[a1].4b[0]\n" - "ins %[b0].d[1], x20\n" - "udot v29.4s, %[b2].16b, %[a1].4b[1]\n" - "ldr x20, [%[b_ptr], #72]\n" - "udot v30.4s, %[b2].16b, %[a1].4b[2]\n" - "udot v31.4s, %[b2].16b, %[a1].4b[3]\n" - "ldr %d[b2], [%[b_ptr], #80]\n" - - "udot v8.4s , %[b0].16b, %[a0a].4b[0]\n" - "ins %[b1].d[1], x20\n" - "udot v9.4s , %[b0].16b, %[a0a].4b[1]\n" - "ldr x20, [%[b_ptr], #88]\n" - "udot v10.4s, %[b0].16b, %[a0a].4b[2]\n" - "ins %[b2].d[1], x20\n" - - "udot v16.4s, %[b1].16b, %[a0a].4b[0]\n" - "udot v24.4s, %[b2].16b, %[a0a].4b[0]\n" - "add %[b_ptr], %[b_ptr], #96\n" - "str q8, [%[c_ptr], #0]\n" - "str q16, [%[c_ptr], #16]\n" - "str q24, [%[c_ptr], #32]\n" - "udot v17.4s, 
%[b1].16b, %[a0a].4b[1]\n" - - "udot v25.4s, %[b2].16b, %[a0a].4b[1]\n" - "str q9, [%[c_ptr], #48]\n" - "str q17, [%[c_ptr], #64]\n" - "str q25, [%[c_ptr], #80]\n" - "udot v18.4s, %[b1].16b, %[a0a].4b[2]\n" - "udot v26.4s, %[b2].16b, %[a0a].4b[2]\n" - "str q10, [%[c_ptr], #96]\n" - "str q18, [%[c_ptr], #112]\n" - "str q26, [%[c_ptr], #128]\n" - "udot v11.4s, %[b0].16b, %[a0a].4b[3]\n" - "udot v19.4s, %[b1].16b, %[a0a].4b[3]\n" - "udot v27.4s, %[b2].16b, %[a0a].4b[3]\n" - "str q11, [%[c_ptr], #144]\n" - "str q19, [%[c_ptr], #160]\n" - "str q27, [%[c_ptr], #176]\n" - "udot v12.4s, %[b0].16b, %[a1a].4b[0]\n" - "udot v20.4s, %[b1].16b, %[a1a].4b[0]\n" - "udot v28.4s, %[b2].16b, %[a1a].4b[0]\n" - "str q12, [%[c_ptr], #192]\n" - "str q20, [%[c_ptr], #208]\n" - "str q28, [%[c_ptr], #224]\n" - "udot v13.4s, %[b0].16b, %[a1a].4b[1]\n" - "udot v21.4s, %[b1].16b, %[a1a].4b[1]\n" - "udot v29.4s, %[b2].16b, %[a1a].4b[1]\n" - "str q13, [%[c_ptr], #240]\n" - "str q21, [%[c_ptr], #256]\n" - "str q29, [%[c_ptr], #272]\n" - "udot v14.4s, %[b0].16b, %[a1a].4b[2]\n" - "udot v22.4s, %[b1].16b, %[a1a].4b[2]\n" - "udot v30.4s, %[b2].16b, %[a1a].4b[2]\n" - "str q14, [%[c_ptr], #288]\n" - "str q22, [%[c_ptr], #304]\n" - "str q30, [%[c_ptr], #320]\n" - "udot v15.4s, %[b0].16b, %[a1a].4b[3]\n" - "udot v23.4s, %[b1].16b, %[a1a].4b[3]\n" - "udot v31.4s, %[b2].16b, %[a1a].4b[3]\n" - "str q15, [%[c_ptr], #336]\n" - - "b 3f\n" - - // Detached final iteration (odd K) - "2:\n" - "udot v8.4s , %[b0].16b, %[a0].4b[0]\n" - "ldr %d[b2], [%[b_ptr], #32]\n" - "ldr x20, [%[b_ptr], #40]\n" - - "udot v16.4s, %[b1].16b, %[a0].4b[0]\n" - "udot v9.4s , %[b0].16b, %[a0].4b[1]\n" - "str q8, [%[c_ptr], #0]\n" - "udot v17.4s, %[b1].16b, %[a0].4b[1]\n" - "str q16, [%[c_ptr], #16]\n" - "ins %[b2].d[1], x20\n" - "udot v24.4s, %[b2].16b, %[a0].4b[0]\n" - "add %[b_ptr], %[b_ptr], #48\n" - "add %[a_ptr], %[a_ptr], #32\n" - "str q24, [%[c_ptr], #32]\n" - "udot v25.4s, %[b2].16b, %[a0].4b[1]\n" - "str q9, [%[c_ptr], #48]\n" - - "udot v10.4s, %[b0].16b, %[a0].4b[2]\n" - "str q17, [%[c_ptr], #64]\n" - "udot v18.4s, %[b1].16b, %[a0].4b[2]\n" - "str q25, [%[c_ptr], #80]\n" - "udot v26.4s, %[b2].16b, %[a0].4b[2]\n" - "str q10, [%[c_ptr], #96]\n" - - "udot v11.4s, %[b0].16b, %[a0].4b[3]\n" - "str q18, [%[c_ptr], #112]\n" - "udot v19.4s, %[b1].16b, %[a0].4b[3]\n" - "str q26, [%[c_ptr], #128]\n" - "udot v27.4s, %[b2].16b, %[a0].4b[3]\n" - "str q11, [%[c_ptr], #144]\n" - - "udot v12.4s, %[b0].16b, %[a1].4b[0]\n" - "str q19, [%[c_ptr], #160]\n" - "udot v20.4s, %[b1].16b, %[a1].4b[0]\n" - "str q27, [%[c_ptr], #176]\n" - "udot v28.4s, %[b2].16b, %[a1].4b[0]\n" - "str q12, [%[c_ptr], #192]\n" - - "udot v13.4s, %[b0].16b, %[a1].4b[1]\n" - "str q20, [%[c_ptr], #208]\n" - "udot v21.4s, %[b1].16b, %[a1].4b[1]\n" - "str q28, [%[c_ptr], #224]\n" - "udot v29.4s, %[b2].16b, %[a1].4b[1]\n" - "str q13, [%[c_ptr], #240]\n" - - "udot v14.4s, %[b0].16b, %[a1].4b[2]\n" - "str q21, [%[c_ptr], #256]\n" - "udot v22.4s, %[b1].16b, %[a1].4b[2]\n" - "str q29, [%[c_ptr], #272]\n" - "udot v30.4s, %[b2].16b, %[a1].4b[2]\n" - "str q14, [%[c_ptr], #288]\n" - - "udot v15.4s, %[b0].16b, %[a1].4b[3]\n" - "str q22, [%[c_ptr], #304]\n" - "udot v23.4s, %[b1].16b, %[a1].4b[3]\n" - "str q30, [%[c_ptr], #320]\n" - "udot v31.4s, %[b2].16b, %[a1].4b[3]\n" - "str q15, [%[c_ptr], #336]\n" - - - // Common tail - "3:\n" - "str q23, [%[c_ptr], #352]\n" - "str q31, [%[c_ptr], #368]\n" - "add %[c_ptr], %[c_ptr], #384\n" - - - - ".purgem udot\n" - : - [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] 
"+r" (c_ptr), - [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a), - [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k) - : [oddk] "r" (oddk) - : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", - "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory" - ); - } - } -} -#endif - diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/dot_toolchain_support.h b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/dot_toolchain_support.h deleted file mode 100644 index 718232fb05..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/dot_toolchain_support.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// Define a macro to assemble the UDOT instruction (in the absence of toolchain support) -#define _DECLARE_UDOT ".altmacro\n"\ - ".macro udot opd:req, opn:req, opm:req\n"\ - "local vd, vn, vm, h, l\n"\ - ".irp reg,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31\n"\ - ".ifeqs \"\\opd\",\"v\\reg\\.4s\"\n"\ - ".set vd,\\reg\n"\ - ".endif\n"\ - ".ifeqs \"\\opn\",\"v\\reg\\.16b\"\n"\ - ".set vn,\\reg\n"\ - ".endif\n"\ - ".irp idx,0,1,2,3\n"\ - ".ifeqs \"\\opm\",\"v\\reg\\.4b[\\idx\\]\"\n"\ - ".set vm,\\reg\n"\ - ".set h,\\idx / 2\n"\ - ".set l,\\idx %% 2\n"\ - ".endif\n"\ - ".endr\n"\ - ".endr\n"\ - ".ifndef vd\n"\ - ".error \"Bad operand \\opd\"\n"\ - ".exitm\n"\ - ".endif\n"\ - ".ifndef vn\n"\ - ".error \"Bad operand \\opn\"\n"\ - ".exitm\n"\ - ".endif\n"\ - ".ifndef vm\n"\ - ".error \"Bad operand \\opm\"\n"\ - ".exitm\n"\ - ".endif\n"\ - ".ifndef h\n"\ - ".error \"Bad operand \\opm\"\n"\ - ".exitm\n"\ - ".endif\n"\ - ".ifndef l\n"\ - ".error \"Bad operand \\opm\"\n"\ - ".exitm\n"\ - ".endif\n"\ - ".int 0x6f80e000 | vd | (vn << 5) | (vm << 16) | (l << 21) | (h << 11)\n"\ - ".endm\n"\ - diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/generic.hpp deleted file mode 100644 index 3531eb6d25..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/generic.hpp +++ /dev/null @@ -1,354 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __aarch64__ - -#include <arm_neon.h> -#include "dot_toolchain_support.h" -#include <cassert> - -inline void a64_gemm_u8_12x8(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) { - assert(Apanel); - assert(Bpanel); - assert(Cpanel); - const uint8_t *a_ptr = Apanel; - uint32_t *c_ptr = Cpanel; - // We divide K by 4 because the udot instruction processes 4 elements at a time. - const int W = K/4; - // Fix up for odd lengths - set a flag if K is odd, but make - // sure we round up the iteration count. - const int oddk = (W & 1); - const int init_value_k = ((W+1)/2) - 1; - for (int yb=0; yb<ablocks; yb++) { - const uint8_t *a_ptr0 = a_ptr; - const uint8_t *b_ptr = Bpanel; - for (int xb=0; xb<bblocks; xb++) { - a_ptr = a_ptr0; - int k = init_value_k; - register uint32x4_t a0 asm("v0"); - register uint32x4_t a1 asm("v1"); - register uint32x4_t b0 asm("v2"); - register uint32x4_t b1 asm("v3"); - register uint32x4_t b2 asm("v4"); - register uint32x4_t a0a asm("v5"); - register uint32x4_t a1a asm("v6"); - __asm __volatile ( - _DECLARE_UDOT - // Initialize result registers, load initial operands, prime prefetches. - "movi v8.4s, #0x0\n" - "ldr %q[a0], [%[a_ptr]]\n" - "movi v9.4s, #0x0\n" - "ldr %q[b0], [%[b_ptr]]\n" - "movi v10.4s, #0x0\n" - "ldr %q[a1], [%[a_ptr], #16]\n" - "movi v11.4s, #0x0\n" - "ldr %q[b1], [%[b_ptr], #16]\n" - "movi v12.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #64]") - "movi v13.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #64]") - "movi v14.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #128]") - "movi v15.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #128]") - "movi v16.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #192]") - "movi v17.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #256]") - "movi v18.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #192]") - "movi v19.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #320]") - "movi v20.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #256]") - "movi v21.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #384]") - "movi v22.4s, #0x0\n" - "movi v23.4s, #0x0\n" - "movi v24.4s, #0x0\n" - "movi v25.4s, #0x0\n" - "movi v26.4s, #0x0\n" - "movi v27.4s, #0x0\n" - "movi v28.4s, #0x0\n" - "movi v29.4s, #0x0\n" - "movi v30.4s, #0x0\n" - "movi v31.4s, #0x0\n" - - // Skip loop if we are doing zero iterations of it. 
- "cbz %w[k], 4f\n" - - // Loop proper - "1:\n" - "udot v8.4s , %[b0].16b, %[a0].4b[0]\n" - "udot v9.4s , %[b0].16b, %[a0].4b[1]\n" - - "ldr %q[b2], [%[b_ptr], #32]\n" - "udot v10.4s, %[b0].16b, %[a0].4b[2]\n" - "udot v11.4s, %[b0].16b, %[a0].4b[3]\n" - "ldr %q[a0a], [%[a_ptr], #32]\n" - "udot v12.4s, %[b0].16b, %[a1].4b[0]\n" - "udot v13.4s, %[b0].16b, %[a1].4b[1]\n" - "ldr %q[a1a], [%[a_ptr], #48]\n" - "udot v14.4s, %[b0].16b, %[a1].4b[2]\n" - "udot v15.4s, %[b0].16b, %[a1].4b[3]\n" - "ldr %q[b0], [%[b_ptr], #48]\n" - - "udot v16.4s, %[b1].16b, %[a0].4b[0]\n" - "udot v17.4s, %[b1].16b, %[a0].4b[1]\n" - ASM_PREFETCH("[%[a_ptr], #320]") - "udot v18.4s, %[b1].16b, %[a0].4b[2]\n" - "udot v19.4s, %[b1].16b, %[a0].4b[3]\n" - "udot v20.4s, %[b1].16b, %[a1].4b[0]\n" - "udot v21.4s, %[b1].16b, %[a1].4b[1]\n" - "udot v22.4s, %[b1].16b, %[a1].4b[2]\n" - "udot v23.4s, %[b1].16b, %[a1].4b[3]\n" - "ldr %q[b1], [%[b_ptr], #64]\n" - - "udot v24.4s, %[b2].16b, %[a0].4b[0]\n" - "udot v25.4s, %[b2].16b, %[a0].4b[1]\n" - ASM_PREFETCH("[%[b_ptr], #448]") - "udot v26.4s, %[b2].16b, %[a0].4b[2]\n" - "udot v27.4s, %[b2].16b, %[a0].4b[3]\n" - "udot v28.4s, %[b2].16b, %[a1].4b[0]\n" - "udot v29.4s, %[b2].16b, %[a1].4b[1]\n" - "udot v30.4s, %[b2].16b, %[a1].4b[2]\n" - "udot v31.4s, %[b2].16b, %[a1].4b[3]\n" - "ldr %q[b2], [%[b_ptr], #80]\n" - - "udot v8.4s , %[b0].16b, %[a0a].4b[0]\n" - "udot v9.4s , %[b0].16b, %[a0a].4b[1]\n" - "ldr %q[a0], [%[a_ptr], #64]\n" - "udot v10.4s, %[b0].16b, %[a0a].4b[2]\n" - "udot v11.4s, %[b0].16b, %[a0a].4b[3]\n" - "udot v12.4s, %[b0].16b, %[a1a].4b[0]\n" - "ldr %q[a1], [%[a_ptr], #80]\n" - "udot v13.4s, %[b0].16b, %[a1a].4b[1]\n" - "udot v14.4s, %[b0].16b, %[a1a].4b[2]\n" - "udot v15.4s, %[b0].16b, %[a1a].4b[3]\n" - "ldr %q[b0], [%[b_ptr], #96]\n" - - "udot v16.4s, %[b1].16b, %[a0a].4b[0]\n" - "udot v17.4s, %[b1].16b, %[a0a].4b[1]\n" - ASM_PREFETCH("[%[b_ptr], #512]") - "udot v18.4s, %[b1].16b, %[a0a].4b[2]\n" - "udot v19.4s, %[b1].16b, %[a0a].4b[3]\n" - "udot v20.4s, %[b1].16b, %[a1a].4b[0]\n" - "udot v21.4s, %[b1].16b, %[a1a].4b[1]\n" - "udot v22.4s, %[b1].16b, %[a1a].4b[2]\n" - "udot v23.4s, %[b1].16b, %[a1a].4b[3]\n" - "ldr %q[b1], [%[b_ptr], #112]\n" - - "udot v24.4s, %[b2].16b, %[a0a].4b[0]\n" - "udot v25.4s, %[b2].16b, %[a0a].4b[1]\n" - "add %[a_ptr], %[a_ptr], #64\n" - "udot v26.4s, %[b2].16b, %[a0a].4b[2]\n" - "udot v27.4s, %[b2].16b, %[a0a].4b[3]\n" - "add %[b_ptr], %[b_ptr], #96\n" - "udot v28.4s, %[b2].16b, %[a1a].4b[0]\n" - "udot v29.4s, %[b2].16b, %[a1a].4b[1]\n" - "subs %w[k], %w[k], #1\n" - "udot v30.4s, %[b2].16b, %[a1a].4b[2]\n" - "udot v31.4s, %[b2].16b, %[a1a].4b[3]\n" - "bne 1b\n" - - // Target to use when K is 1 or 2 (i.e. 
zero iterations of main loop) - "4:\n" - - // Branch to alternative tail for odd K - "cbnz %w[oddk], 2f\n" - - // Detached final iteration (even K) - "udot v8.4s , %[b0].16b, %[a0].4b[0]\n" - "udot v9.4s , %[b0].16b, %[a0].4b[1]\n" - "ldr %q[b2], [%[b_ptr], #32]\n" - "udot v10.4s, %[b0].16b, %[a0].4b[2]\n" - "udot v11.4s, %[b0].16b, %[a0].4b[3]\n" - "ldr %q[a0a], [%[a_ptr], #32]\n" - "udot v12.4s, %[b0].16b, %[a1].4b[0]\n" - "udot v13.4s, %[b0].16b, %[a1].4b[1]\n" - "ldr %q[a1a], [%[a_ptr], #48]\n" - "udot v14.4s, %[b0].16b, %[a1].4b[2]\n" - "udot v15.4s, %[b0].16b, %[a1].4b[3]\n" - "ldr %q[b0], [%[b_ptr], #48]\n" - - "udot v16.4s, %[b1].16b, %[a0].4b[0]\n" - "udot v17.4s, %[b1].16b, %[a0].4b[1]\n" - "udot v18.4s, %[b1].16b, %[a0].4b[2]\n" - "udot v19.4s, %[b1].16b, %[a0].4b[3]\n" - "udot v20.4s, %[b1].16b, %[a1].4b[0]\n" - "udot v21.4s, %[b1].16b, %[a1].4b[1]\n" - "udot v22.4s, %[b1].16b, %[a1].4b[2]\n" - "udot v23.4s, %[b1].16b, %[a1].4b[3]\n" - "ldr %q[b1], [%[b_ptr], #64]\n" - - "udot v24.4s, %[b2].16b, %[a0].4b[0]\n" - "udot v25.4s, %[b2].16b, %[a0].4b[1]\n" - "add %[a_ptr], %[a_ptr], #64\n" - "udot v26.4s, %[b2].16b, %[a0].4b[2]\n" - "udot v27.4s, %[b2].16b, %[a0].4b[3]\n" - "udot v28.4s, %[b2].16b, %[a1].4b[0]\n" - "udot v29.4s, %[b2].16b, %[a1].4b[1]\n" - "udot v30.4s, %[b2].16b, %[a1].4b[2]\n" - "udot v31.4s, %[b2].16b, %[a1].4b[3]\n" - "ldr %q[b2], [%[b_ptr], #80]\n" - - "udot v8.4s , %[b0].16b, %[a0a].4b[0]\n" - - "udot v16.4s, %[b1].16b, %[a0a].4b[0]\n" - "add %[b_ptr], %[b_ptr], #96\n" - "udot v9.4s , %[b0].16b, %[a0a].4b[1]\n" - "str q8, [%[c_ptr], #0]\n" - "udot v17.4s, %[b1].16b, %[a0a].4b[1]\n" - "str q16, [%[c_ptr], #16]\n" - "udot v24.4s, %[b2].16b, %[a0a].4b[0]\n" - "str q24, [%[c_ptr], #32]\n" - - "udot v25.4s, %[b2].16b, %[a0a].4b[1]\n" - "str q9, [%[c_ptr], #48]\n" - "udot v10.4s, %[b0].16b, %[a0a].4b[2]\n" - "str q17, [%[c_ptr], #64]\n" - "udot v18.4s, %[b1].16b, %[a0a].4b[2]\n" - "str q25, [%[c_ptr], #80]\n" - "udot v26.4s, %[b2].16b, %[a0a].4b[2]\n" - "str q10, [%[c_ptr], #96]\n" - - "udot v11.4s, %[b0].16b, %[a0a].4b[3]\n" - "str q18, [%[c_ptr], #112]\n" - "udot v19.4s, %[b1].16b, %[a0a].4b[3]\n" - "str q26, [%[c_ptr], #128]\n" - "udot v27.4s, %[b2].16b, %[a0a].4b[3]\n" - "str q11, [%[c_ptr], #144]\n" - - "udot v12.4s, %[b0].16b, %[a1a].4b[0]\n" - "str q19, [%[c_ptr], #160]\n" - "udot v20.4s, %[b1].16b, %[a1a].4b[0]\n" - "str q27, [%[c_ptr], #176]\n" - "udot v28.4s, %[b2].16b, %[a1a].4b[0]\n" - "str q12, [%[c_ptr], #192]\n" - - "udot v13.4s, %[b0].16b, %[a1a].4b[1]\n" - "str q20, [%[c_ptr], #208]\n" - "udot v21.4s, %[b1].16b, %[a1a].4b[1]\n" - "str q28, [%[c_ptr], #224]\n" - "udot v29.4s, %[b2].16b, %[a1a].4b[1]\n" - "str q13, [%[c_ptr], #240]\n" - - "udot v14.4s, %[b0].16b, %[a1a].4b[2]\n" - "str q21, [%[c_ptr], #256]\n" - "udot v22.4s, %[b1].16b, %[a1a].4b[2]\n" - "str q29, [%[c_ptr], #272]\n" - "udot v30.4s, %[b2].16b, %[a1a].4b[2]\n" - "str q14, [%[c_ptr], #288]\n" - - "udot v15.4s, %[b0].16b, %[a1a].4b[3]\n" - "str q22, [%[c_ptr], #304]\n" - "udot v23.4s, %[b1].16b, %[a1a].4b[3]\n" - "str q30, [%[c_ptr], #320]\n" - "udot v31.4s, %[b2].16b, %[a1a].4b[3]\n" - "str q15, [%[c_ptr], #336]\n" - - "b 3f\n" - - // Detached final iteration (odd K) - "2:\n" - "udot v8.4s , %[b0].16b, %[a0].4b[0]\n" - "ldr %q[b2], [%[b_ptr], #32]\n" - "udot v16.4s, %[b1].16b, %[a0].4b[0]\n" - "udot v9.4s , %[b0].16b, %[a0].4b[1]\n" - "str q8, [%[c_ptr], #0]\n" - "udot v17.4s, %[b1].16b, %[a0].4b[1]\n" - "str q16, [%[c_ptr], #16]\n" - "udot v24.4s, %[b2].16b, %[a0].4b[0]\n" - "add 
%[b_ptr], %[b_ptr], #48\n" - "add %[a_ptr], %[a_ptr], #32\n" - "str q24, [%[c_ptr], #32]\n" - "udot v25.4s, %[b2].16b, %[a0].4b[1]\n" - "str q9, [%[c_ptr], #48]\n" - - "udot v10.4s, %[b0].16b, %[a0].4b[2]\n" - "str q17, [%[c_ptr], #64]\n" - "udot v18.4s, %[b1].16b, %[a0].4b[2]\n" - "str q25, [%[c_ptr], #80]\n" - "udot v26.4s, %[b2].16b, %[a0].4b[2]\n" - "str q10, [%[c_ptr], #96]\n" - - "udot v11.4s, %[b0].16b, %[a0].4b[3]\n" - "str q18, [%[c_ptr], #112]\n" - "udot v19.4s, %[b1].16b, %[a0].4b[3]\n" - "str q26, [%[c_ptr], #128]\n" - "udot v27.4s, %[b2].16b, %[a0].4b[3]\n" - "str q11, [%[c_ptr], #144]\n" - - "udot v12.4s, %[b0].16b, %[a1].4b[0]\n" - "str q19, [%[c_ptr], #160]\n" - "udot v20.4s, %[b1].16b, %[a1].4b[0]\n" - "str q27, [%[c_ptr], #176]\n" - "udot v28.4s, %[b2].16b, %[a1].4b[0]\n" - "str q12, [%[c_ptr], #192]\n" - - "udot v13.4s, %[b0].16b, %[a1].4b[1]\n" - "str q20, [%[c_ptr], #208]\n" - "udot v21.4s, %[b1].16b, %[a1].4b[1]\n" - "str q28, [%[c_ptr], #224]\n" - "udot v29.4s, %[b2].16b, %[a1].4b[1]\n" - "str q13, [%[c_ptr], #240]\n" - - "udot v14.4s, %[b0].16b, %[a1].4b[2]\n" - "str q21, [%[c_ptr], #256]\n" - "udot v22.4s, %[b1].16b, %[a1].4b[2]\n" - "str q29, [%[c_ptr], #272]\n" - "udot v30.4s, %[b2].16b, %[a1].4b[2]\n" - "str q14, [%[c_ptr], #288]\n" - - "udot v15.4s, %[b0].16b, %[a1].4b[3]\n" - "str q22, [%[c_ptr], #304]\n" - "udot v23.4s, %[b1].16b, %[a1].4b[3]\n" - "str q30, [%[c_ptr], #320]\n" - "udot v31.4s, %[b2].16b, %[a1].4b[3]\n" - "str q15, [%[c_ptr], #336]\n" - - - // Common tail - "3:\n" - "str q23, [%[c_ptr], #352]\n" - "str q31, [%[c_ptr], #368]\n" - "add %[c_ptr], %[c_ptr], #384\n" - - ".purgem udot\n" - : - [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), - [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a), - [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k) - : [oddk] "r" (oddk) - : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", - "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc" - ); - - } - } - - -} -#endif diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4.hpp deleted file mode 100644 index 3561bfec96..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4.hpp +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __aarch64__ - -// Load the actual kernel -#include "a64_gemm_u8_4x4/generic.hpp" - -class gemm_u8_4x4 { -public: - typedef uint8_t operand_type; - typedef uint32_t result_type; - - typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); - - /* Describes the data layout for A input */ - static const int A_interleave = 4; - static const int A_block = 16; - static const bool A_transpose = false; - - /* Same for B input */ - static const int B_interleave = 4; - static const int B_block = 16; - static const bool B_transpose = true; - - /* Kernel blocking parameters */ - static const int out_width = 4; - static const int out_height = 4; - static const int k_unroll = 16; - - kern_type kernel = nullptr; - - gemm_u8_4x4(const CPUInfo *ci) { - kernel = a64_gemm_u8_4x4; - } -}; - -#endif // __aarch64__ - diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4/generic.hpp deleted file mode 100644 index aff3faf666..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4/generic.hpp +++ /dev/null @@ -1,281 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
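
As a reading aid for the assembly below, here is a minimal scalar model of what one 4x4 output block computes, assuming the block-interleaved operand layout implied by the strategy class above (A_interleave = 4 rows with A_block = 16 bytes along K, and likewise for B). The function name is hypothetical and this is a sketch under those assumptions, not library code:

#include <cstdint>

// Scalar model of one 4x4 block: C[row][col] = sum over k of A[row][k] * B[col][k].
// Both panels are assumed stored as blocks of 16 k-values per row/column,
// 4 rows/columns per block (so 64 bytes per k-block).
inline void gemm_u8_4x4_ref(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int K) {
    for (int row = 0; row < 4; row++) {
        for (int col = 0; col < 4; col++) {
            uint32_t acc = 0;
            for (int k = 0; k < K; k++) {
                uint8_t a = Apanel[(k / 16) * 64 + row * 16 + (k % 16)];
                uint8_t b = Bpanel[(k / 16) * 64 + col * 16 + (k % 16)];
                acc += static_cast<uint32_t>(a) * b;   // u8*u8 widened to u32
            }
            Cpanel[row * 4 + col] = acc;   // 4x4 block, row major
        }
    }
}

On the layout assumed here, the umull/umull2 plus uadalp sequences in the kernel accumulate these same u8*u8 products into 32-bit lanes, and the final addp cascade reduces them to the 4x4 row-major block stored through q16..q19.
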
- */ -#pragma once - -#ifdef __aarch64__ - -#include <arm_neon.h> - -inline void a64_gemm_u8_4x4(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) { - const uint8_t *a_ptr = Apanel; - uint32_t *c_ptr = Cpanel; - K /= 16; - - for (int yb=0; yb<ablocks; yb++) { - const uint8_t *a_ptr0 = a_ptr; - const uint8_t *b_ptr = Bpanel; - - for (int xb=0; xb<bblocks; xb++) { - a_ptr = a_ptr0; - - int k = K-1; - - register uint8x16_t b0 asm("v4"); - register uint8x16_t b1 asm("v5"); - register uint8x16_t b2 asm("v6"); - register uint8x16_t b3 asm("v7"); - - __asm __volatile ( - "movi v16.4s, #0x0\n" - "ldr q0, [%[a_ptr]]\n" - "movi v17.4s, #0x0\n" - "ldr %q[b0], [%[b_ptr]]\n" - "movi v18.4s, #0x0\n" - "ldr %q[b1], [%[b_ptr], #16]\n" - "movi v19.4s, #0x0\n" - "ldr %q[b2], [%[b_ptr], #32]\n" - "movi v20.4s, #0x0\n" - "ldr %q[b3], [%[b_ptr], #48]\n" - "movi v21.4s, #0x0\n" - "ldr q1, [%[a_ptr], #16]\n" - "movi v22.4s, #0x0\n" - "ldr q2, [%[a_ptr], #32]\n" - "movi v23.4s, #0x0\n" - "ldr q3, [%[a_ptr], #48]\n" - "movi v24.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #64]") - "movi v25.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #64]") - "movi v26.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #128]") - "movi v27.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #128]") - "movi v28.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #192]") - "movi v29.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #192]") - "movi v30.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #256]") - "movi v31.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #256]") - - "umull v12.8h, v0.8b, %[b0].8b\n" - "add %[a_ptr], %[a_ptr], #64\n" - "umull v13.8h, v0.8b, %[b1].8b\n" - "umull v14.8h, v0.8b, %[b2].8b\n" - "add %[b_ptr], %[b_ptr], #64\n" - "umull v15.8h, v0.8b, %[b3].8b\n" - - // Skip loop if we are doing zero iterations of it. 
- "cbz %w[k], 2f\n" - - "1:\n" - "uadalp v16.4s, v12.8h\n" - "umull2 v12.8h, v0.16b, %[b0].16b\n" - "uadalp v17.4s, v13.8h\n" - "umull2 v13.8h, v0.16b, %[b1].16b\n" - "uadalp v18.4s, v14.8h\n" - "umull2 v14.8h, v0.16b, %[b2].16b\n" - "uadalp v19.4s, v15.8h\n" - "umull2 v15.8h, v0.16b, %[b3].16b\n" - "ldr q0, [%[a_ptr]]\n" - - "uadalp v16.4s, v12.8h\n" - "umull v12.8h, v1.8b, %[b0].8b\n" - "uadalp v17.4s, v13.8h\n" - "umull v13.8h, v1.8b, %[b1].8b\n" - "subs %w[k], %w[k], #1\n" - "uadalp v18.4s, v14.8h\n" - "umull v14.8h, v1.8b, %[b2].8b\n" - "uadalp v19.4s, v15.8h\n" - "umull v15.8h, v1.8b, %[b3].8b\n" - - "uadalp v20.4s, v12.8h\n" - "umull2 v12.8h, v1.16b, %[b0].16b\n" - "uadalp v21.4s, v13.8h\n" - "umull2 v13.8h, v1.16b, %[b1].16b\n" - ASM_PREFETCH("[%[a_ptr], #256]") - "uadalp v22.4s, v14.8h\n" - "umull2 v14.8h, v1.16b, %[b2].16b\n" - "uadalp v23.4s, v15.8h\n" - "umull2 v15.8h, v1.16b, %[b3].16b\n" - "ldr q1, [%[a_ptr], #16]\n" - - "uadalp v20.4s, v12.8h\n" - "umull v12.8h, v2.8b, %[b0].8b\n" - "uadalp v21.4s, v13.8h\n" - "umull v13.8h, v2.8b, %[b1].8b\n" - ASM_PREFETCH("[%[b_ptr], #256]") - "uadalp v22.4s, v14.8h\n" - "umull v14.8h, v2.8b, %[b2].8b\n" - "uadalp v23.4s, v15.8h\n" - "umull v15.8h, v2.8b, %[b3].8b\n" - - "uadalp v24.4s, v12.8h\n" - "umull2 v12.8h, v2.16b, %[b0].16b\n" - "uadalp v25.4s, v13.8h\n" - "umull2 v13.8h, v2.16b, %[b1].16b\n" - "uadalp v26.4s, v14.8h\n" - "umull2 v14.8h, v2.16b, %[b2].16b\n" - "uadalp v27.4s, v15.8h\n" - "umull2 v15.8h, v2.16b, %[b3].16b\n" - "ldr q2, [%[a_ptr], #32]\n" - - "uadalp v24.4s, v12.8h\n" - "umull v12.8h, v3.8b, %[b0].8b\n" - "uadalp v25.4s, v13.8h\n" - "umull v13.8h, v3.8b, %[b1].8b\n" - "uadalp v26.4s, v14.8h\n" - "umull v14.8h, v3.8b, %[b2].8b\n" - "uadalp v27.4s, v15.8h\n" - "umull v15.8h, v3.8b, %[b3].8b\n" - - "uadalp v28.4s, v12.8h\n" - "umull2 v12.8h, v3.16b, %[b0].16b\n" - "ldr %q[b0], [%[b_ptr]]\n" - "uadalp v29.4s, v13.8h\n" - "umull2 v13.8h, v3.16b, %[b1].16b\n" - "ldr %q[b1], [%[b_ptr], #16]\n" - "uadalp v30.4s, v14.8h\n" - "umull2 v14.8h, v3.16b, %[b2].16b\n" - "ldr %q[b2], [%[b_ptr], #32]\n" - "uadalp v31.4s, v15.8h\n" - "umull2 v15.8h, v3.16b, %[b3].16b\n" - "ldr %q[b3], [%[b_ptr], #48]\n" - - "uadalp v28.4s, v12.8h\n" - "umull v12.8h, v0.8b, %[b0].8b\n" - "add %[b_ptr], %[b_ptr], #64\n" - "uadalp v29.4s, v13.8h\n" - "umull v13.8h, v0.8b, %[b1].8b\n" - "ldr q3, [%[a_ptr], #48]\n" - "uadalp v30.4s, v14.8h\n" - "umull v14.8h, v0.8b, %[b2].8b\n" - "add %[a_ptr], %[a_ptr], #64\n" - "uadalp v31.4s, v15.8h\n" - "umull v15.8h, v0.8b, %[b3].8b\n" - "bne 1b\n" - - // Branch target - "2:\n" - "uadalp v16.4s, v12.8h\n" - "umull2 v12.8h, v0.16b, %[b0].16b\n" - "uadalp v17.4s, v13.8h\n" - "umull2 v13.8h, v0.16b, %[b1].16b\n" - "uadalp v18.4s, v14.8h\n" - "umull2 v14.8h, v0.16b, %[b2].16b\n" - "uadalp v19.4s, v15.8h\n" - "umull2 v15.8h, v0.16b, %[b3].16b\n" - - "uadalp v16.4s, v12.8h\n" - "umull v12.8h, v1.8b, %[b0].8b\n" - "uadalp v17.4s, v13.8h\n" - "umull v13.8h, v1.8b, %[b1].8b\n" - "uadalp v18.4s, v14.8h\n" - "umull v14.8h, v1.8b, %[b2].8b\n" - "uadalp v19.4s, v15.8h\n" - "umull v15.8h, v1.8b, %[b3].8b\n" - - "uadalp v20.4s, v12.8h\n" - "umull2 v12.8h, v1.16b, %[b0].16b\n" - "uadalp v21.4s, v13.8h\n" - "umull2 v13.8h, v1.16b, %[b1].16b\n" - "uadalp v22.4s, v14.8h\n" - "umull2 v14.8h, v1.16b, %[b2].16b\n" - "uadalp v23.4s, v15.8h\n" - "umull2 v15.8h, v1.16b, %[b3].16b\n" - - "uadalp v20.4s, v12.8h\n" - "umull v12.8h, v2.8b, %[b0].8b\n" - "uadalp v21.4s, v13.8h\n" - "umull v13.8h, v2.8b, %[b1].8b\n" - "uadalp v22.4s, v14.8h\n" - 
"umull v14.8h, v2.8b, %[b2].8b\n" - "uadalp v23.4s, v15.8h\n" - "umull v15.8h, v2.8b, %[b3].8b\n" - - "uadalp v24.4s, v12.8h\n" - "umull2 v12.8h, v2.16b, %[b0].16b\n" - "uadalp v25.4s, v13.8h\n" - "umull2 v13.8h, v2.16b, %[b1].16b\n" - "uadalp v26.4s, v14.8h\n" - "umull2 v14.8h, v2.16b, %[b2].16b\n" - "uadalp v27.4s, v15.8h\n" - "umull2 v15.8h, v2.16b, %[b3].16b\n" - - "uadalp v24.4s, v12.8h\n" - "umull v12.8h, v3.8b, %[b0].8b\n" - "uadalp v25.4s, v13.8h\n" - "umull v13.8h, v3.8b, %[b1].8b\n" - "uadalp v26.4s, v14.8h\n" - "umull v14.8h, v3.8b, %[b2].8b\n" - "uadalp v27.4s, v15.8h\n" - "umull v15.8h, v3.8b, %[b3].8b\n" - - "uadalp v28.4s, v12.8h\n" - "umull2 v12.8h, v3.16b, %[b0].16b\n" - "uadalp v29.4s, v13.8h\n" - "umull2 v13.8h, v3.16b, %[b1].16b\n" - "uadalp v30.4s, v14.8h\n" - "umull2 v14.8h, v3.16b, %[b2].16b\n" - "uadalp v31.4s, v15.8h\n" - "umull2 v15.8h, v3.16b, %[b3].16b\n" - - "uadalp v28.4s, v12.8h\n" - "uadalp v29.4s, v13.8h\n" - "uadalp v30.4s, v14.8h\n" - "uadalp v31.4s, v15.8h\n" - - "addp v16.4s, v16.4s, v17.4s\n" - "addp v17.4s, v18.4s, v19.4s\n" - "addp v18.4s, v20.4s, v21.4s\n" - "addp v19.4s, v22.4s, v23.4s\n" - "addp v20.4s, v24.4s, v25.4s\n" - "addp v21.4s, v26.4s, v27.4s\n" - "addp v22.4s, v28.4s, v29.4s\n" - "addp v23.4s, v30.4s, v31.4s\n" - - "addp v16.4s, v16.4s, v17.4s\n" - "addp v17.4s, v18.4s, v19.4s\n" - "addp v18.4s, v20.4s, v21.4s\n" - "addp v19.4s, v22.4s, v23.4s\n" - - "str q16, [%[c_ptr]]\n" - "str q17, [%[c_ptr], #16]\n" - "str q18, [%[c_ptr], #32]\n" - "str q19, [%[c_ptr], #48]\n" - "add %[c_ptr], %[c_ptr], #64\n" - - : - [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), - [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [b3] "+w" (b3), - [k] "+r" (k) - : - : "x20", "x21", "v0","v1","v2","v3","v12","v13","v14","v15","v16","v17","v18","v19", - "v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31", "cc"); - } - } -} - -#endif // __aarch64__ diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8.hpp deleted file mode 100644 index 5e7684f692..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8.hpp +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */
-#pragma once
-
-#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-
-// Get the components we need to implement HGEMM.
-// Can select appropriate components dependent on AArch32 vs. AArch64 etc. at build time.
-#include "a64_hgemm_24x8/generic.hpp"
-#include "a64_hgemm_24x8/a55r1.hpp"
-
-// 24x8 HGEMM "strategy" class. Describes the kernel properties.
-//
-// The generic "gemm_opt" function will instantiate one of these (allowing
-// the constructor to pick a kernel implementation).
-class hgemm_24x8 {
-public:
-    typedef __fp16 operand_type;
-    typedef __fp16 result_type;
-
-    typedef void (*kern_type)(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
-
-    static const int A_block = 1;
-    static const int A_interleave = 8;
-    static const bool A_transpose = false;
-
-    static const int B_block = 1;
-    static const int B_interleave = 24;
-    static const bool B_transpose = true;
-
-    static const int out_width = 24;
-    static const int out_height = 8;
-    static const int k_unroll = 1;
-
-    kern_type kernel = nullptr;
-
-    hgemm_24x8(const struct CPUInfo *ci) {
-        kernel = a64_hgemm_asimd_24x8;
-        if (ci->CPU == CPUTarget::A55_DOT) {
-            kernel = a64_hgemm_asimd_24x8_a55r1;
-        }
-    }
-
-};
-
-#endif // __aarch64__ and FP16_VECTOR_ARITHMETIC
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8/a55r1.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8/a55r1.hpp
deleted file mode 100644
index 1789abb046..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8/a55r1.hpp
+++ /dev/null
@@ -1,384 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#include <arm_neon.h>
-
-// Kernel implementation.
-//
-// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order.
-// Assume that "Bpanel" points to a chunk of B blocks (each size 24xK) in read-order.
-// Assume that "Cpanel" points to a chunk of C output blocks (each size
-// 24x8), the chunks being arranged in a row major fashion.
-//
-// Note that the intent of this is that either ablocks or bblocks will be 1
-// - this construction allows the output loop to proceed in either order.
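
To make the register blocking easier to follow, here is a scalar model of one 24x8 output block under the layout just described. The helper name is hypothetical and the code is a sketch, not part of the library; it assumes an AArch64 toolchain with __fp16 support, matching the guard on this file:

// One 24x8 C block: A supplies 8 fp16 values per k step, B supplies 24.
// Accumulation is done in fp16, matching the fmla .8h instructions below.
inline void hgemm_24x8_ref(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int K) {
    for (int row = 0; row < 8; row++) {
        for (int col = 0; col < 24; col++) {
            __fp16 acc = static_cast<__fp16>(0.0f);
            for (int k = 0; k < K; k++) {
                acc += Apanel[k * 8 + row] * Bpanel[k * 24 + col];
            }
            Cpanel[row * 24 + col] = acc;   // 8 rows of 24, row major
        }
    }
}

Note that the summation order differs from the vectorized kernel, so fp16 results may differ in the last bit.
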
- -inline void a64_hgemm_asimd_24x8_a55r1(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) { - const __fp16 *a_ptr = Apanel; - __fp16 *c_ptr = Cpanel; - - // Fix up for odd lengths - set a flag if K is odd, but make - // sure we round up the iteration count. - int oddk = (K & 1); - int k_iters = ((K+1)/2) - 1; - - for (int yb=0; yb<ablocks; yb++) { - const __fp16 *a_ptr0 = a_ptr; - const __fp16 *b_ptr = Bpanel; - - for (int xb=0; xb<bblocks; xb++) { - int k = k_iters; - a_ptr = a_ptr0; - - // As A55 requires 64-bit loads anyway, just use 64 bits of the - // "A" operands to save on "ins" instructions. Since A55 is - // in-order, two sets of "A" operands and one set of "B" is - // sufficient. - register float16x8_t a0 asm("v0"); - register float16x8_t a1 asm("v1"); - register float16x8_t a0a asm("v2"); - register float16x8_t a1a asm("v3"); - register float16x8_t b0 asm("v4"); - register float16x8_t b1 asm("v5"); - register float16x8_t b2 asm("v6"); - - __asm __volatile ( - // Initialize result registers, load initial operands, prime prefetches. - "movi v8.8h, #0x0\n" - "ldr %d[a0], [%[a_ptr]]\n" - "movi v9.8h, #0x0\n" - "ldr %q[b0], [%[b_ptr]]\n" - "movi v10.8h, #0x0\n" - "ldr %d[a1], [%[a_ptr], #8]\n" - "movi v11.8h, #0x0\n" - "ldr %q[b1], [%[b_ptr], #16]\n" - "movi v12.8h, #0x0\n" - "movi v13.8h, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #64]") - "movi v14.8h, #0x0\n" - "movi v15.8h, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #128]") - "movi v16.8h, #0x0\n" - "movi v17.8h, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #64]") - "movi v18.8h, #0x0\n" - "movi v19.8h, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #192]") - "movi v20.8h, #0x0\n" - "movi v21.8h, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #256]") - "movi v22.8h, #0x0\n" - "movi v23.8h, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #320]") - "movi v24.8h, #0x0\n" - "movi v25.8h, #0x0\n" - "movi v26.8h, #0x0\n" - "movi v27.8h, #0x0\n" - "movi v28.8h, #0x0\n" - "movi v29.8h, #0x0\n" - "movi v30.8h, #0x0\n" - "movi v31.8h, #0x0\n" - - // The loop is offset by these two instructions which must - // always be executed. - "fmla v8.8h , %[b0].8h, %[a0].h[0]\n" - "ldr %d[b2], [%[b_ptr], #32]\n" - - // Skip loop if we are doing zero iterations of it. 
- "cbz %w[k], 4f\n" - - "1:\n" - "fmla v9.8h , %[b0].8h, %[a0].h[1]\n" - "ldr x20, [%[b_ptr], #40]\n" - "fmla v10.8h, %[b0].8h, %[a0].h[2]\n" - "subs %w[k], %w[k], #1\n" - "fmla v11.8h, %[b0].8h, %[a0].h[3]\n" - "ldr %d[a0a], [%[a_ptr], #16]\n" - - "fmla v12.8h, %[b0].8h, %[a1].h[0]\n" - "ins %[b2].d[1], x20\n" - "fmla v13.8h, %[b0].8h, %[a1].h[1]\n" - "fmla v14.8h, %[b0].8h, %[a1].h[2]\n" - "fmla v15.8h, %[b0].8h, %[a1].h[3]\n" - "ldr %d[a1a], [%[a_ptr], #24]\n" - - "fmla v16.8h, %[b1].8h, %[a0].h[0]\n" - "fmla v17.8h, %[b1].8h, %[a0].h[1]\n" - "fmla v18.8h, %[b1].8h, %[a0].h[2]\n" - "fmla v19.8h, %[b1].8h, %[a0].h[3]\n" - "ldr %d[b0], [%[b_ptr], #48]\n" - - "fmla v20.8h, %[b1].8h, %[a1].h[0]\n" - "fmla v21.8h, %[b1].8h, %[a1].h[1]\n" - "ldr x20, [%[b_ptr], #56]\n" - "fmla v22.8h, %[b1].8h, %[a1].h[2]\n" - "fmla v23.8h, %[b1].8h, %[a1].h[3]\n" - "ldr %d[b1], [%[b_ptr], #64]\n" - - "fmla v24.8h, %[b2].8h, %[a0].h[0]\n" - "ins %[b0].d[1], x20\n" - "fmla v25.8h, %[b2].8h, %[a0].h[1]\n" - "ldr x20, [%[b_ptr], #72]\n" - "fmla v26.8h, %[b2].8h, %[a0].h[2]\n" - "fmla v27.8h, %[b2].8h, %[a0].h[3]\n" - ASM_PREFETCH("[%[a_ptr], #128]") - - "fmla v28.8h, %[b2].8h, %[a1].h[0]\n" - "fmla v29.8h, %[b2].8h, %[a1].h[1]\n" - ASM_PREFETCH("[%[b_ptr], #384]") - "fmla v30.8h, %[b2].8h, %[a1].h[2]\n" - "fmla v31.8h, %[b2].8h, %[a1].h[3]\n" - "ldr %d[b2], [%[b_ptr], #80]\n" - - // Unroll 1 - "fmla v8.8h , %[b0].8h, %[a0a].h[0]\n" - "ins %[b1].d[1], x20\n" - "fmla v9.8h , %[b0].8h, %[a0a].h[1]\n" - "ldr x20, [%[b_ptr], #88]\n" - "fmla v10.8h, %[b0].8h, %[a0a].h[2]\n" - "fmla v11.8h, %[b0].8h, %[a0a].h[3]\n" - "ldr %d[a0], [%[a_ptr], #32]\n" - - "fmla v12.8h, %[b0].8h, %[a1a].h[0]\n" - "ins %[b2].d[1], x20\n" - "fmla v13.8h, %[b0].8h, %[a1a].h[1]\n" - "fmla v14.8h, %[b0].8h, %[a1a].h[2]\n" - "fmla v15.8h, %[b0].8h, %[a1a].h[3]\n" - "ldr %d[a1], [%[a_ptr], #40]\n" - - "fmla v16.8h, %[b1].8h, %[a0a].h[0]\n" - "add %[a_ptr], %[a_ptr], #32\n" - "fmla v17.8h, %[b1].8h, %[a0a].h[1]\n" - "fmla v18.8h, %[b1].8h, %[a0a].h[2]\n" - "fmla v19.8h, %[b1].8h, %[a0a].h[3]\n" - "ldr %d[b0], [%[b_ptr], #96]\n" - - "fmla v20.8h, %[b1].8h, %[a1a].h[0]\n" - "fmla v21.8h, %[b1].8h, %[a1a].h[1]\n" - "ldr x20, [%[b_ptr], #104]\n" - "fmla v22.8h, %[b1].8h, %[a1a].h[2]\n" - "fmla v23.8h, %[b1].8h, %[a1a].h[3]\n" - "ldr %d[b1], [%[b_ptr], #112]\n" - - "fmla v24.8h, %[b2].8h, %[a0a].h[0]\n" - "ins %[b0].d[1], x20\n" - "fmla v25.8h, %[b2].8h, %[a0a].h[1]\n" - "ldr x20, [%[b_ptr], #120]\n" - "fmla v26.8h, %[b2].8h, %[a0a].h[2]\n" - "fmla v27.8h, %[b2].8h, %[a0a].h[3]\n" - - "fmla v28.8h, %[b2].8h, %[a1a].h[0]\n" - ASM_PREFETCH("[%[b_ptr], #448]") - "fmla v29.8h, %[b2].8h, %[a1a].h[1]\n" - "add %[b_ptr], %[b_ptr], #96\n" - "fmla v30.8h, %[b2].8h, %[a1a].h[2]\n" - "ins %[b1].d[1], x20\n" - "fmla v31.8h, %[b2].8h, %[a1a].h[3]\n" - "ldr %d[b2], [%[b_ptr], #32]\n" - - "fmla v8.8h , %[b0].8h, %[a0].h[0]\n" - "bne 1b\n" - - "4:\n" - - // Start final iteration - branch off to "odd" code before we load a0a - "fmla v9.8h , %[b0].8h, %[a0].h[1]\n" - "ldr x20, [%[b_ptr], #40]\n" - "fmla v10.8h, %[b0].8h, %[a0].h[2]\n" - "cbnz %w[oddk], 2f\n" - - // Even K continuation - "fmla v11.8h, %[b0].8h, %[a0].h[3]\n" - "ldr %d[a0a], [%[a_ptr], #16]\n" - - "fmla v12.8h, %[b0].8h, %[a1].h[0]\n" - "ins %[b2].d[1], x20\n" - "fmla v13.8h, %[b0].8h, %[a1].h[1]\n" - ASM_PREFETCHW("[%[c_ptr]]") - "fmla v14.8h, %[b0].8h, %[a1].h[2]\n" - "fmla v15.8h, %[b0].8h, %[a1].h[3]\n" - "ldr %d[a1a], [%[a_ptr], #24]\n" - - "fmla v16.8h, %[b1].8h, %[a0].h[0]\n" - "fmla v17.8h, 
%[b1].8h, %[a0].h[1]\n" - ASM_PREFETCHW("[%[c_ptr], #64]") - "fmla v18.8h, %[b1].8h, %[a0].h[2]\n" - "fmla v19.8h, %[b1].8h, %[a0].h[3]\n" - "ldr %d[b0], [%[b_ptr], #48]\n" - - "fmla v20.8h, %[b1].8h, %[a1].h[0]\n" - "fmla v21.8h, %[b1].8h, %[a1].h[1]\n" - "ldr x20, [%[b_ptr], #56]\n" - "fmla v22.8h, %[b1].8h, %[a1].h[2]\n" - "fmla v23.8h, %[b1].8h, %[a1].h[3]\n" - "ldr %d[b1], [%[b_ptr], #64]\n" - - "fmla v24.8h, %[b2].8h, %[a0].h[0]\n" - "ins %[b0].d[1], x20\n" - "fmla v25.8h, %[b2].8h, %[a0].h[1]\n" - "ldr x20, [%[b_ptr], #72]\n" - "fmla v26.8h, %[b2].8h, %[a0].h[2]\n" - "fmla v27.8h, %[b2].8h, %[a0].h[3]\n" - ASM_PREFETCHW("[%[c_ptr], #128]") - - "fmla v28.8h, %[b2].8h, %[a1].h[0]\n" - "fmla v29.8h, %[b2].8h, %[a1].h[1]\n" - ASM_PREFETCHW("[%[c_ptr], #192]") - "fmla v30.8h, %[b2].8h, %[a1].h[2]\n" - "fmla v31.8h, %[b2].8h, %[a1].h[3]\n" - "ldr %d[b2], [%[b_ptr], #80]\n" - - "fmla v8.8h , %[b0].8h, %[a0a].h[0]\n" - "ins %[b1].d[1], x20\n" - "fmla v9.8h , %[b0].8h, %[a0a].h[1]\n" - "ldr x20, [%[b_ptr], #88]\n" - "fmla v10.8h, %[b0].8h, %[a0a].h[2]\n" - "fmla v11.8h, %[b0].8h, %[a0a].h[3]\n" - ASM_PREFETCHW("[%[c_ptr], #256]") - - "fmla v12.8h, %[b0].8h, %[a1a].h[0]\n" - "ins %[b2].d[1], x20\n" - "fmla v13.8h, %[b0].8h, %[a1a].h[1]\n" - ASM_PREFETCHW("[%[c_ptr], #320]") - "fmla v14.8h, %[b0].8h, %[a1a].h[2]\n" - "fmla v15.8h, %[b0].8h, %[a1a].h[3]\n" - "ldr %d[a1], [%[a_ptr], #40]\n" - - "fmla v16.8h, %[b1].8h, %[a0a].h[0]\n" - "add %[a_ptr], %[a_ptr], #32\n" - "fmla v17.8h, %[b1].8h, %[a0a].h[1]\n" - ASM_PREFETCHWL2("[%[c_ptr], #384]") - "fmla v18.8h, %[b1].8h, %[a0a].h[2]\n" - "fmla v19.8h, %[b1].8h, %[a0a].h[3]\n" - ASM_PREFETCHWL2("[%[c_ptr], #448]") - - "fmla v20.8h, %[b1].8h, %[a1a].h[0]\n" - "fmla v21.8h, %[b1].8h, %[a1a].h[1]\n" - ASM_PREFETCHWL2("[%[c_ptr], #512]") - "fmla v22.8h, %[b1].8h, %[a1a].h[2]\n" - "fmla v23.8h, %[b1].8h, %[a1a].h[3]\n" - ASM_PREFETCHWL2("[%[c_ptr], #576]") - - "fmla v24.8h, %[b2].8h, %[a0a].h[0]\n" - "fmla v25.8h, %[b2].8h, %[a0a].h[1]\n" - ASM_PREFETCHWL2("[%[c_ptr], #640]") - "fmla v26.8h, %[b2].8h, %[a0a].h[2]\n" - "fmla v27.8h, %[b2].8h, %[a0a].h[3]\n" - ASM_PREFETCHWL2("[%[c_ptr], #704]") - - "fmla v28.8h, %[b2].8h, %[a1a].h[0]\n" - "fmla v29.8h, %[b2].8h, %[a1a].h[1]\n" - "add %[b_ptr], %[b_ptr], #96\n" - "fmla v30.8h, %[b2].8h, %[a1a].h[2]\n" - "fmla v31.8h, %[b2].8h, %[a1a].h[3]\n" - "b 3f\n" - - "2:\n" - - // Odd tail - "fmla v11.8h, %[b0].8h, %[a0].h[3]\n" - ASM_PREFETCHW("[%[c_ptr]]") - - "fmla v12.8h, %[b0].8h, %[a1].h[0]\n" - "ins %[b2].d[1], x20\n" - "fmla v13.8h, %[b0].8h, %[a1].h[1]\n" - ASM_PREFETCHW("[%[c_ptr], #64]") - "fmla v14.8h, %[b0].8h, %[a1].h[2]\n" - "add %[a_ptr], %[a_ptr], #16\n" - "fmla v15.8h, %[b0].8h, %[a1].h[3]\n" - ASM_PREFETCHW("[%[c_ptr], #128]") - - "fmla v16.8h, %[b1].8h, %[a0].h[0]\n" - "add %[b_ptr], %[b_ptr], #48\n" - "fmla v17.8h, %[b1].8h, %[a0].h[1]\n" - ASM_PREFETCHW("[%[c_ptr], #192]") - "fmla v18.8h, %[b1].8h, %[a0].h[2]\n" - "fmla v19.8h, %[b1].8h, %[a0].h[3]\n" - ASM_PREFETCHW("[%[c_ptr], #256]") - - "fmla v20.8h, %[b1].8h, %[a1].h[0]\n" - "fmla v21.8h, %[b1].8h, %[a1].h[1]\n" - ASM_PREFETCHW("[%[c_ptr], #320]") - "fmla v22.8h, %[b1].8h, %[a1].h[2]\n" - "fmla v23.8h, %[b1].8h, %[a1].h[3]\n" - ASM_PREFETCHWL2("[%[c_ptr], #384]") - - "fmla v24.8h, %[b2].8h, %[a0].h[0]\n" - "fmla v25.8h, %[b2].8h, %[a0].h[1]\n" - ASM_PREFETCHWL2("[%[c_ptr], #384]") - "fmla v26.8h, %[b2].8h, %[a0].h[2]\n" - "fmla v27.8h, %[b2].8h, %[a0].h[3]\n" - ASM_PREFETCHWL2("[%[c_ptr], #448]") - - "fmla v28.8h, %[b2].8h, %[a1].h[0]\n" - 
ASM_PREFETCHWL2("[%[c_ptr], #512]") - "fmla v29.8h, %[b2].8h, %[a1].h[1]\n" - ASM_PREFETCHWL2("[%[c_ptr], #576]") - "fmla v30.8h, %[b2].8h, %[a1].h[2]\n" - ASM_PREFETCHWL2("[%[c_ptr], #640]") - "fmla v31.8h, %[b2].8h, %[a1].h[3]\n" - ASM_PREFETCHWL2("[%[c_ptr], #704]") - - // Common tail - // A55 won't dual issue these stores with anything else, so - // simplest to do them all in this common code. - "3:\n" - "str q8, [%[c_ptr]]\n" - "str q16, [%[c_ptr], #16]\n" - "str q24, [%[c_ptr], #32]\n" - "str q9, [%[c_ptr], #48]\n" - "str q17, [%[c_ptr], #64]\n" - "str q25, [%[c_ptr], #80]\n" - "str q10, [%[c_ptr], #96]\n" - "str q18, [%[c_ptr], #112]\n" - "str q26, [%[c_ptr], #128]\n" - "str q11, [%[c_ptr], #144]\n" - "str q19, [%[c_ptr], #160]\n" - "str q27, [%[c_ptr], #176]\n" - "str q12, [%[c_ptr], #192]\n" - "str q20, [%[c_ptr], #208]\n" - "str q28, [%[c_ptr], #224]\n" - "str q13, [%[c_ptr], #240]\n" - "str q21, [%[c_ptr], #256]\n" - "str q29, [%[c_ptr], #272]\n" - "str q14, [%[c_ptr], #288]\n" - "str q22, [%[c_ptr], #304]\n" - "str q30, [%[c_ptr], #320]\n" - "str q15, [%[c_ptr], #336]\n" - "str q23, [%[c_ptr], #352]\n" - "str q31, [%[c_ptr], #368]\n" - "5:\n" - "add %[c_ptr], %[c_ptr], #384\n" - : - [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), - [a0] "=w" (a0), [a0a] "=w" (a0a), [a1] "=w" (a1), [a1a] "=w" (a1a), - [b0] "=w" (b0), [b1] "=w" (b1), [b2] "=w" (b2), [k] "+r" (k) - : [oddk] "r" (oddk) - : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", - "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory" - ); - } - } -} diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8/generic.hpp deleted file mode 100644 index 03e2bb95a3..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8/generic.hpp +++ /dev/null @@ -1,337 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#include <arm_neon.h> - -// Kernel implementation. -// -// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order. -// Assume that "Bpanel" points to a chunk of B blocks (each size 12xK) in read-order. -// Assume that "Cpanel" points to a chunk of C output blocks (each size -// 12x8), the chunks being arranged in a row major fashion. 
-// -// Note that the intent of this is that either ablocks or bblocks will be 1 -// - this construction allows the output loop to proceed in either order. - -inline void a64_hgemm_asimd_24x8(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) { - const __fp16 *a_ptr = Apanel; - __fp16 *c_ptr = Cpanel; - for (int yb=0; yb<ablocks; yb++) { - const __fp16 *a_ptr0 = a_ptr; - const __fp16 *b_ptr = Bpanel; - - for (int xb=0; xb<bblocks; xb++) { - a_ptr = a_ptr0; - // Fix up for odd lengths - set a flag if K is odd, but make - // sure we round up the iteration count. - int oddk = (K & 1); - int k = ((K+1)/2) - 1; - register float16x8_t a0 asm("v0"); - register float16x8_t a0a asm("v1"); - register float16x8_t b0 asm("v2"); - register float16x8_t b1 asm("v3"); - register float16x8_t b2 asm("v4"); - register float16x8_t b0a asm("v5"); - register float16x8_t b1a asm("v6"); - register float16x8_t b2a asm("v7"); - - __asm __volatile ( - // Initialize result registers, load initial operands, prime prefetches. - "movi v8.8h, #0x0\n" - "ldr %q[a0], [%[a_ptr]]\n" - "movi v9.8h, #0x0\n" - "ldr %q[b0], [%[b_ptr]]\n" - "movi v10.8h, #0x0\n" - "ldr %q[b1], [%[b_ptr], #16]\n" - "movi v11.8h, #0x0\n" - "ldr %q[b2], [%[b_ptr], #32]\n" - "movi v12.8h, #0x0\n" - "ldr %q[b0a], [%[b_ptr], #48]\n" - "movi v13.8h, #0x0\n" - "ldr %q[b1a], [%[b_ptr], #64]\n" - "movi v14.8h, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #64]") - "movi v15.8h, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #128]") - "movi v16.8h, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #64]") - "movi v17.8h, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #192]") - "movi v18.8h, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #256]") - "movi v19.8h, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #320]") - "movi v20.8h, #0x0\n" - "movi v21.8h, #0x0\n" - "movi v22.8h, #0x0\n" - "movi v23.8h, #0x0\n" - "movi v24.8h, #0x0\n" - "movi v25.8h, #0x0\n" - "movi v26.8h, #0x0\n" - "movi v27.8h, #0x0\n" - "movi v28.8h, #0x0\n" - "movi v29.8h, #0x0\n" - "movi v30.8h, #0x0\n" - "movi v31.8h, #0x0\n" - - // Skip loop if we are doing zero iterations of it. 
- "cbz %w[k], 4f\n" - - "1:\n" - "fmla v8.8h , %[b0].8h, %[a0].h[0]\n" - "fmla v9.8h , %[b0].8h, %[a0].h[1]\n" - "ldr %q[a0a], [%[a_ptr], #16]\n" - "fmla v10.8h, %[b0].8h, %[a0].h[2]\n" - "fmla v11.8h, %[b0].8h, %[a0].h[3]\n" - "ldr %q[b2a], [%[b_ptr], #80]\n" - "fmla v12.8h, %[b0].8h, %[a0].h[4]\n" - "fmla v13.8h, %[b0].8h, %[a0].h[5]\n" - "fmla v14.8h, %[b0].8h, %[a0].h[6]\n" - "fmla v15.8h, %[b0].8h, %[a0].h[7]\n" - "ldr %q[b0], [%[b_ptr], #96]\n" - - "fmla v16.8h, %[b1].8h, %[a0].h[0]\n" - "fmla v17.8h, %[b1].8h, %[a0].h[1]\n" - ASM_PREFETCH("[%[a_ptr], #128]") - "fmla v18.8h, %[b1].8h, %[a0].h[2]\n" - "fmla v19.8h, %[b1].8h, %[a0].h[3]\n" - "add %[b_ptr], %[b_ptr], #96\n" - "fmla v20.8h, %[b1].8h, %[a0].h[4]\n" - "fmla v21.8h, %[b1].8h, %[a0].h[5]\n" - "fmla v22.8h, %[b1].8h, %[a0].h[6]\n" - "fmla v23.8h, %[b1].8h, %[a0].h[7]\n" - "ldr %q[b1], [%[b_ptr], #16]\n" - - "fmla v24.8h, %[b2].8h, %[a0].h[0]\n" - "fmla v25.8h, %[b2].8h, %[a0].h[1]\n" - ASM_PREFETCH("[%[b_ptr], #288]") - "fmla v26.8h, %[b2].8h, %[a0].h[2]\n" - "fmla v27.8h, %[b2].8h, %[a0].h[3]\n" - "fmla v28.8h, %[b2].8h, %[a0].h[4]\n" - "fmla v29.8h, %[b2].8h, %[a0].h[5]\n" - "fmla v30.8h, %[b2].8h, %[a0].h[6]\n" - "fmla v31.8h, %[b2].8h, %[a0].h[7]\n" - "ldr %q[a0], [%[a_ptr], #32]\n" - - "fmla v8.8h , %[b0a].8h, %[a0a].h[0]\n" - "fmla v9.8h , %[b0a].8h, %[a0a].h[1]\n" - "ldr %q[b2], [%[b_ptr], #32]\n" - "fmla v10.8h, %[b0a].8h, %[a0a].h[2]\n" - "fmla v11.8h, %[b0a].8h, %[a0a].h[3]\n" - "fmla v12.8h, %[b0a].8h, %[a0a].h[4]\n" - "fmla v13.8h, %[b0a].8h, %[a0a].h[5]\n" - "fmla v14.8h, %[b0a].8h, %[a0a].h[6]\n" - "fmla v15.8h, %[b0a].8h, %[a0a].h[7]\n" - "ldr %q[b0a], [%[b_ptr], #48]\n" - - "fmla v16.8h, %[b1a].8h, %[a0a].h[0]\n" - "fmla v17.8h, %[b1a].8h, %[a0a].h[1]\n" - ASM_PREFETCH("[%[b_ptr], #352]") - "fmla v18.8h, %[b1a].8h, %[a0a].h[2]\n" - "fmla v19.8h, %[b1a].8h, %[a0a].h[3]\n" - "fmla v20.8h, %[b1a].8h, %[a0a].h[4]\n" - "fmla v21.8h, %[b1a].8h, %[a0a].h[5]\n" - "fmla v22.8h, %[b1a].8h, %[a0a].h[6]\n" - "fmla v23.8h, %[b1a].8h, %[a0a].h[7]\n" - "ldr %q[b1a], [%[b_ptr], #64]\n" - - "fmla v24.8h, %[b2a].8h, %[a0a].h[0]\n" - "fmla v25.8h, %[b2a].8h, %[a0a].h[1]\n" - "add %[a_ptr], %[a_ptr], #32\n" - "fmla v26.8h, %[b2a].8h, %[a0a].h[2]\n" - "fmla v27.8h, %[b2a].8h, %[a0a].h[3]\n" - "fmla v28.8h, %[b2a].8h, %[a0a].h[4]\n" - "fmla v29.8h, %[b2a].8h, %[a0a].h[5]\n" - "subs %w[k], %w[k], #1\n" - "fmla v30.8h, %[b2a].8h, %[a0a].h[6]\n" - "fmla v31.8h, %[b2a].8h, %[a0a].h[7]\n" - - "bne 1b\n" - "4:\n" - - // Jump to odd tail if necessary. - "cbnz %w[oddk], 2f\n" - - // Even tail. 
- "fmla v8.8h , %[b0].8h, %[a0].h[0]\n" - "fmla v9.8h , %[b0].8h, %[a0].h[1]\n" - "ldr %q[a0a], [%[a_ptr], #16]\n" - "fmla v10.8h, %[b0].8h, %[a0].h[2]\n" - "fmla v11.8h, %[b0].8h, %[a0].h[3]\n" - "ldr %q[b2a], [%[b_ptr], #80]\n" - "fmla v12.8h, %[b0].8h, %[a0].h[4]\n" - "fmla v13.8h, %[b0].8h, %[a0].h[5]\n" - "fmla v14.8h, %[b0].8h, %[a0].h[6]\n" - "fmla v15.8h, %[b0].8h, %[a0].h[7]\n" - - "fmla v16.8h, %[b1].8h, %[a0].h[0]\n" - "fmla v17.8h, %[b1].8h, %[a0].h[1]\n" - "add %[b_ptr], %[b_ptr], #96\n" - "fmla v18.8h, %[b1].8h, %[a0].h[2]\n" - "fmla v19.8h, %[b1].8h, %[a0].h[3]\n" - "fmla v20.8h, %[b1].8h, %[a0].h[4]\n" - "fmla v21.8h, %[b1].8h, %[a0].h[5]\n" - "add %[a_ptr], %[a_ptr], #32\n" - "fmla v22.8h, %[b1].8h, %[a0].h[6]\n" - "fmla v23.8h, %[b1].8h, %[a0].h[7]\n" - - "fmla v24.8h, %[b2].8h, %[a0].h[0]\n" - "fmla v25.8h, %[b2].8h, %[a0].h[1]\n" - "fmla v26.8h, %[b2].8h, %[a0].h[2]\n" - "fmla v27.8h, %[b2].8h, %[a0].h[3]\n" - "fmla v28.8h, %[b2].8h, %[a0].h[4]\n" - "fmla v29.8h, %[b2].8h, %[a0].h[5]\n" - "fmla v30.8h, %[b2].8h, %[a0].h[6]\n" - "fmla v31.8h, %[b2].8h, %[a0].h[7]\n" - - "fmla v8.8h , %[b0a].8h, %[a0a].h[0]\n" - "fmla v16.8h, %[b1a].8h, %[a0a].h[0]\n" - "str q8, [%[c_ptr]]\n" - "fmla v24.8h, %[b2a].8h, %[a0a].h[0]\n" - "str q16, [%[c_ptr], #16]\n" - - "fmla v9.8h , %[b0a].8h, %[a0a].h[1]\n" - "str q24, [%[c_ptr], #32]\n" - "fmla v17.8h, %[b1a].8h, %[a0a].h[1]\n" - "str q9, [%[c_ptr], #48]\n" - "fmla v25.8h, %[b2a].8h, %[a0a].h[1]\n" - "str q17, [%[c_ptr], #64]\n" - - "fmla v10.8h, %[b0a].8h, %[a0a].h[2]\n" - "str q25, [%[c_ptr], #80]\n" - "fmla v18.8h, %[b1a].8h, %[a0a].h[2]\n" - "str q10, [%[c_ptr], #96]\n" - "fmla v26.8h, %[b2a].8h, %[a0a].h[2]\n" - "str q18, [%[c_ptr], #112]\n" - - "fmla v11.8h, %[b0a].8h, %[a0a].h[3]\n" - "str q26, [%[c_ptr], #128]\n" - "fmla v19.8h, %[b1a].8h, %[a0a].h[3]\n" - "str q11, [%[c_ptr], #144]\n" - "fmla v27.8h, %[b2a].8h, %[a0a].h[3]\n" - "str q19, [%[c_ptr], #160]\n" - - "fmla v12.8h, %[b0a].8h, %[a0a].h[4]\n" - "str q27, [%[c_ptr], #176]\n" - "fmla v20.8h, %[b1a].8h, %[a0a].h[4]\n" - "str q12, [%[c_ptr], #192]\n" - "fmla v28.8h, %[b2a].8h, %[a0a].h[4]\n" - "str q20, [%[c_ptr], #208]\n" - - "fmla v13.8h, %[b0a].8h, %[a0a].h[5]\n" - "str q28, [%[c_ptr], #224]\n" - "fmla v21.8h, %[b1a].8h, %[a0a].h[5]\n" - "str q13, [%[c_ptr], #240]\n" - "fmla v29.8h, %[b2a].8h, %[a0a].h[5]\n" - "str q21, [%[c_ptr], #256]\n" - - "fmla v14.8h, %[b0a].8h, %[a0a].h[6]\n" - "str q29, [%[c_ptr], #272]\n" - "fmla v22.8h, %[b1a].8h, %[a0a].h[6]\n" - "str q14, [%[c_ptr], #288]\n" - "fmla v30.8h, %[b2a].8h, %[a0a].h[6]\n" - "str q22, [%[c_ptr], #304]\n" - - "fmla v15.8h, %[b0a].8h, %[a0a].h[7]\n" - "str q30, [%[c_ptr], #320]\n" - "fmla v23.8h, %[b1a].8h, %[a0a].h[7]\n" - "str q15, [%[c_ptr], #336]\n" - "fmla v31.8h, %[b2a].8h, %[a0a].h[7]\n" - "b 3f\n" - - // Odd tail - "2:\n" - "fmla v8.8h , %[b0].8h, %[a0].h[0]\n" - "add %[b_ptr], %[b_ptr], #48\n" - "fmla v16.8h, %[b1].8h, %[a0].h[0]\n" - "add %[a_ptr], %[a_ptr], #16\n" - "str q8, [%[c_ptr]]\n" - "fmla v24.8h, %[b2].8h, %[a0].h[0]\n" - "str q16, [%[c_ptr], #16]\n" - - "fmla v9.8h , %[b0].8h, %[a0].h[1]\n" - "str q24, [%[c_ptr], #32]\n" - "fmla v17.8h, %[b1].8h, %[a0].h[1]\n" - "str q9, [%[c_ptr], #48]\n" - "fmla v25.8h, %[b2].8h, %[a0].h[1]\n" - "str q17, [%[c_ptr], #64]\n" - - "fmla v10.8h, %[b0].8h, %[a0].h[2]\n" - "str q25, [%[c_ptr], #80]\n" - "fmla v18.8h, %[b1].8h, %[a0].h[2]\n" - "str q10, [%[c_ptr], #96]\n" - "fmla v26.8h, %[b2].8h, %[a0].h[2]\n" - "str q18, [%[c_ptr], #112]\n" - - "fmla v11.8h, %[b0].8h, 
%[a0].h[3]\n" - "str q26, [%[c_ptr], #128]\n" - "fmla v19.8h, %[b1].8h, %[a0].h[3]\n" - "str q11, [%[c_ptr], #144]\n" - "fmla v27.8h, %[b2].8h, %[a0].h[3]\n" - "str q19, [%[c_ptr], #160]\n" - - "fmla v12.8h, %[b0].8h, %[a0].h[4]\n" - "str q27, [%[c_ptr], #176]\n" - "fmla v20.8h, %[b1].8h, %[a0].h[4]\n" - "str q12, [%[c_ptr], #192]\n" - "fmla v28.8h, %[b2].8h, %[a0].h[4]\n" - "str q20, [%[c_ptr], #208]\n" - - "fmla v13.8h, %[b0].8h, %[a0].h[5]\n" - "str q28, [%[c_ptr], #224]\n" - "fmla v21.8h, %[b1].8h, %[a0].h[5]\n" - "str q13, [%[c_ptr], #240]\n" - "fmla v29.8h, %[b2].8h, %[a0].h[5]\n" - "str q21, [%[c_ptr], #256]\n" - - "fmla v14.8h, %[b0].8h, %[a0].h[6]\n" - "str q29, [%[c_ptr], #272]\n" - "fmla v22.8h, %[b1].8h, %[a0].h[6]\n" - "str q14, [%[c_ptr], #288]\n" - "fmla v30.8h, %[b2].8h, %[a0].h[6]\n" - "str q22, [%[c_ptr], #304]\n" - - "fmla v15.8h, %[b0].8h, %[a0].h[7]\n" - "str q30, [%[c_ptr], #320]\n" - "fmla v23.8h, %[b1].8h, %[a0].h[7]\n" - "str q15, [%[c_ptr], #336]\n" - "fmla v31.8h, %[b2].8h, %[a0].h[7]\n" - - "3:\n" - "str q23, [%[c_ptr], #352]\n" - "str q31, [%[c_ptr], #368]\n" - "add %[c_ptr], %[c_ptr], #384\n" - : - [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), - [a0] "+w" (a0), [a0a] "+w" (a0a), - [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k), - [b0a] "+w" (b0a), [b1a] "+w" (b1a), [b2a] "+w" (b2a) - : [oddk] "r" (oddk) - : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", - "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory" - ); - } - } -} diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp deleted file mode 100644 index 603ad8dc0a..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __aarch64__ - -// Actual kernel implementations -#include "a64_sgemm_12x8/generic.hpp" -#include "a64_sgemm_12x8/a53.hpp" -#include "a64_sgemm_12x8/a55.hpp" -#include "a64_sgemm_12x8/a55r1.hpp" - - -// 12x8 SGEMM "strategy" class. -// -// This describes the characteristics of a family of kernels, in terms of -// the required interleave properties and the output block size. -// -// All kernels in the family must share these characteristics. 
The actual -// kernel to be used can be chosen at runtime, based on the CPU_type -// structure. -class sgemm_12x8 { -public: - typedef float operand_type; - typedef float result_type; - - typedef void (*kern_type)(const float *, const float *, float *, int, int, int); - - /* Describes the data layout for A input */ - static const int A_interleave = 8; - static const int A_block = 1; - static const int A_transpose = 0; - - /* Same for B input */ - static const int B_interleave = 12; - static const int B_block = 1; - static const int B_transpose = 1; - - /* Kernel blocking parameters */ - static const int out_width = 12; - static const int out_height = 8; - static const int k_unroll = 1; - - kern_type kernel{nullptr}; - - sgemm_12x8(const CPUInfo *ci) { - kernel = a64_sgemm_asimd_12x8; - if (ci->CPU == CPUTarget::A53) { - kernel = a64_sgemm_asimd_12x8_a53; - } - else if (ci->CPU == CPUTarget::A55) { - kernel = a64_sgemm_asimd_12x8_a55; - } - else if (ci->CPU == CPUTarget::A55_DOT) { - kernel = a64_sgemm_asimd_12x8_a55r1; - } - } -}; - -#endif // __aarch64__ diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a53.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a53.hpp deleted file mode 100644 index 1c9b4b38fc..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a53.hpp +++ /dev/null @@ -1,368 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -inline void a64_sgemm_asimd_12x8_a53(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { - const float *a_ptr = Apanel; - float *c_ptr = Cpanel; - - for (int yb=0; yb<ablocks; yb++) { - const float *a_ptr0 = a_ptr; - const float *b_ptr = Bpanel; - - for (int xb=0; xb<bblocks; xb++) { - a_ptr = a_ptr0; - // Fix up for odd lengths - set a flag if K is odd, but make - // sure we round up the iteration count. - int oddk = (K & 1); - int k = ((K+1)/2) - 1; - - register float32x4_t a0 asm("v0"); - register float32x4_t a1 asm("v1"); - register float32x4_t b0 asm("v2"); - register float32x4_t b1 asm("v3"); - register float32x4_t b2 asm("v4"); - register float32x4_t a0a asm("v5"); - register float32x4_t a1a asm("v6"); - - __asm __volatile ( - // Initialize result registers, load initial operands, prime prefetches. 
- "movi v8.4s, #0x0\n" - "ldr %q[a0], [%[a_ptr]]\n" - "movi v9.4s, #0x0\n" - "ldr %q[b0], [%[b_ptr]]\n" - "movi v10.4s, #0x0\n" - "ldr %q[a1], [%[a_ptr], #16]\n" - "movi v11.4s, #0x0\n" - "ldr %q[b1], [%[b_ptr], #16]\n" - "movi v12.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #64]") - "movi v13.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #64]") - "movi v14.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #128]") - "movi v15.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #128]") - "movi v16.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #192]") - "movi v17.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #256]") - "movi v18.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #192]") - "movi v19.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #320]") - "movi v20.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #256]") - "movi v21.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #384]") - "movi v22.4s, #0x0\n" - "movi v23.4s, #0x0\n" - "movi v24.4s, #0x0\n" - "movi v25.4s, #0x0\n" - "movi v26.4s, #0x0\n" - "movi v27.4s, #0x0\n" - "movi v28.4s, #0x0\n" - "movi v29.4s, #0x0\n" - "movi v30.4s, #0x0\n" - "movi v31.4s, #0x0\n" - - // Skip loop if we are doing zero iterations of it. - "cbz %w[k], 4f\n" - - "1:\n" - // Unroll 0 - "ldr %d[b2], [%[b_ptr], #32]\n" - "nop\n" - "fmla v8.4s , %[b0].4s, %[a0].s[0]\n" - "ldr x20, [%[b_ptr], #40]\n" - "fmla v9.4s , %[b0].4s, %[a0].s[1]\n" - "subs %w[k], %w[k], #1\n" - "fmla v10.4s, %[b0].4s, %[a0].s[2]\n" - - "ldr %d[a0a], [%[a_ptr], #32]\n" - "ins %[b2].d[1], x20\n" - "fmla v11.4s, %[b0].4s, %[a0].s[3]\n" - "ldr x20, [%[a_ptr], #40]\n" - "fmla v12.4s, %[b0].4s, %[a1].s[0]\n" - "fmla v13.4s, %[b0].4s, %[a1].s[1]\n" - - "ldr %d[a1a], [%[a_ptr], #48]\n" - "ins %[a0a].d[1], x20\n" - "fmla v14.4s, %[b0].4s, %[a1].s[2]\n" - "ldr x20, [%[a_ptr], #56]\n" - "fmla v15.4s, %[b0].4s, %[a1].s[3]\n" - "fmla v16.4s, %[b1].4s, %[a0].s[0]\n" - - "ldr %d[b0], [%[b_ptr], #48]\n" - "ins %[a1a].d[1], x20\n" - "fmla v17.4s, %[b1].4s, %[a0].s[1]\n" - "ldr x20, [%[b_ptr], #56]\n" - "fmla v18.4s, %[b1].4s, %[a0].s[2]\n" - "fmla v19.4s, %[b1].4s, %[a0].s[3]\n" - - ASM_PREFETCH("[%[a_ptr], #320]") - "ins %[b0].d[1], x20\n" - "fmla v20.4s, %[b1].4s, %[a1].s[0]\n" - "fmla v21.4s, %[b1].4s, %[a1].s[1]\n" - "fmla v22.4s, %[b1].4s, %[a1].s[2]\n" - - ASM_PREFETCH("[%[b_ptr], #448]") - "nop\n" - "fmla v23.4s, %[b1].4s, %[a1].s[3]\n" - "fmla v24.4s, %[b2].4s, %[a0].s[0]\n" - "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" - - "ldr %d[b1], [%[b_ptr], #64]\n" - "nop\n" - "fmla v26.4s, %[b2].4s, %[a0].s[2]\n" - "ldr x20, [%[b_ptr], #72]\n" - "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" - "fmla v28.4s, %[b2].4s, %[a1].s[0]\n" - - ASM_PREFETCH("[%[b_ptr], #512]") - "ins %[b1].d[1], x20\n" - "fmla v29.4s, %[b2].4s, %[a1].s[1]\n" - "fmla v30.4s, %[b2].4s, %[a1].s[2]\n" - "fmla v31.4s, %[b2].4s, %[a1].s[3]\n" - - // Unroll 1 - "ldr %d[b2], [%[b_ptr], #80]\n" - "nop\n" - "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n" - "ldr x20, [%[b_ptr], #88]\n" - "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n" - "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n" - - "ldr %d[a0], [%[a_ptr], #64]\n" - "ins %[b2].d[1], x20\n" - "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n" - "ldr x20, [%[a_ptr], #72]\n" - "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n" - "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n" - - "ldr %d[a1], [%[a_ptr], #80]\n" - "ins %[a0].d[1], x20\n" - "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n" - "ldr x20, [%[a_ptr], #88]\n" - "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n" - "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n" - - "ldr %d[b0], [%[b_ptr], #96]\n" - "ins %[a1].d[1], x20\n" - "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n" - "ldr x20, [%[b_ptr], #104]\n" - "fmla v18.4s, %[b1].4s, 
%[a0a].s[2]\n" - "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n" - - "nop\n" - "ins %[b0].d[1], x20\n" - "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n" - "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n" - "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n" - - "nop\n" - "nop\n" - "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n" - "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n" - "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n" - - "ldr %d[b1], [%[b_ptr], #112]\n" - "nop\n" - "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n" - "ldr x20, [%[b_ptr], #120]\n" - "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n" - "add %[a_ptr], %[a_ptr], #64\n" - "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n" - "add %[b_ptr], %[b_ptr], #96\n" - - "nop\n" - "ins %[b1].d[1], x20\n" - "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n" - "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n" - "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n" - - "bne 1b\n" - - // Branch here if K=1 or 2. Do the right thing for odd/even at the end. - "4:\n" - "cbnz %w[oddk], 2f\n" - - // Detached final iteration. (even K) - "ldr %d[b2], [%[b_ptr], #32]\n" - "nop\n" - "fmla v8.4s , %[b0].4s, %[a0].s[0]\n" - "ldr x20, [%[b_ptr], #40]\n" - "fmla v9.4s , %[b0].4s, %[a0].s[1]\n" - "subs %w[k], %w[k], #1\n" - "fmla v10.4s, %[b0].4s, %[a0].s[2]\n" - - "ldr %d[a0a], [%[a_ptr], #32]\n" - "ins %[b2].d[1], x20\n" - "fmla v11.4s, %[b0].4s, %[a0].s[3]\n" - "ldr x20, [%[a_ptr], #40]\n" - "fmla v12.4s, %[b0].4s, %[a1].s[0]\n" - "fmla v13.4s, %[b0].4s, %[a1].s[1]\n" - - "ldr %d[a1a], [%[a_ptr], #48]\n" - "ins %[a0a].d[1], x20\n" - "fmla v14.4s, %[b0].4s, %[a1].s[2]\n" - "ldr x20, [%[a_ptr], #56]\n" - "fmla v15.4s, %[b0].4s, %[a1].s[3]\n" - "fmla v16.4s, %[b1].4s, %[a0].s[0]\n" - - "ldr %d[b0], [%[b_ptr], #48]\n" - "ins %[a1a].d[1], x20\n" - "fmla v17.4s, %[b1].4s, %[a0].s[1]\n" - "ldr x20, [%[b_ptr], #56]\n" - "fmla v18.4s, %[b1].4s, %[a0].s[2]\n" - "fmla v19.4s, %[b1].4s, %[a0].s[3]\n" - - "ins %[b0].d[1], x20\n" - "fmla v20.4s, %[b1].4s, %[a1].s[0]\n" - "fmla v21.4s, %[b1].4s, %[a1].s[1]\n" - "fmla v22.4s, %[b1].4s, %[a1].s[2]\n" - - "nop\n" - "fmla v23.4s, %[b1].4s, %[a1].s[3]\n" - "fmla v24.4s, %[b2].4s, %[a0].s[0]\n" - "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" - - "ldr %d[b1], [%[b_ptr], #64]\n" - "nop\n" - "fmla v26.4s, %[b2].4s, %[a0].s[2]\n" - "ldr x20, [%[b_ptr], #72]\n" - "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" - "fmla v28.4s, %[b2].4s, %[a1].s[0]\n" - - "ins %[b1].d[1], x20\n" - "fmla v29.4s, %[b2].4s, %[a1].s[1]\n" - "fmla v30.4s, %[b2].4s, %[a1].s[2]\n" - "fmla v31.4s, %[b2].4s, %[a1].s[3]\n" - - "ldr %d[b2], [%[b_ptr], #80]\n" - "nop\n" - "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n" - "ldr x20, [%[b_ptr], #88]\n" - "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n" - "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n" - - "ins %[b2].d[1], x20\n" - "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n" - "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n" - "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n" - "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n" - "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n" - "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n" - "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n" - "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n" - "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n" - "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n" - "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n" - "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n" - "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n" - "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n" - "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n" - "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n" - "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n" - "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n" - "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n" - "add %[a_ptr], %[a_ptr], #64\n" - "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n" - "add %[b_ptr], 
%[b_ptr], #96\n" - "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n" - "b 3f\n" - - // Detached final iteration. (odd K) - "2:\n" - "ldr %d[b2], [%[b_ptr], #32]\n" - "nop\n" - "fmla v8.4s , %[b0].4s, %[a0].s[0]\n" - "ldr x20, [%[b_ptr], #40]\n" - "fmla v9.4s , %[b0].4s, %[a0].s[1]\n" - "fmla v10.4s, %[b0].4s, %[a0].s[2]\n" - - "ins %[b2].d[1], x20\n" - "fmla v11.4s, %[b0].4s, %[a0].s[3]\n" - "fmla v12.4s, %[b0].4s, %[a1].s[0]\n" - "fmla v13.4s, %[b0].4s, %[a1].s[1]\n" - "fmla v14.4s, %[b0].4s, %[a1].s[2]\n" - "fmla v15.4s, %[b0].4s, %[a1].s[3]\n" - "fmla v16.4s, %[b1].4s, %[a0].s[0]\n" - "fmla v17.4s, %[b1].4s, %[a0].s[1]\n" - "fmla v18.4s, %[b1].4s, %[a0].s[2]\n" - "fmla v19.4s, %[b1].4s, %[a0].s[3]\n" - "fmla v20.4s, %[b1].4s, %[a1].s[0]\n" - "fmla v21.4s, %[b1].4s, %[a1].s[1]\n" - "fmla v22.4s, %[b1].4s, %[a1].s[2]\n" - "fmla v23.4s, %[b1].4s, %[a1].s[3]\n" - "fmla v24.4s, %[b2].4s, %[a0].s[0]\n" - "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" - "fmla v26.4s, %[b2].4s, %[a0].s[2]\n" - "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" - "fmla v28.4s, %[b2].4s, %[a1].s[0]\n" - "fmla v29.4s, %[b2].4s, %[a1].s[1]\n" - "add %[a_ptr], %[a_ptr], #32\n" - "fmla v30.4s, %[b2].4s, %[a1].s[2]\n" - "add %[b_ptr], %[b_ptr], #48\n" - "fmla v31.4s, %[b2].4s, %[a1].s[3]\n" - - // Common tail - "3:\n" - "str q8, [%[c_ptr]]\n" - "str q16, [%[c_ptr], #16]\n" - "str q24, [%[c_ptr], #32]\n" - "str q9, [%[c_ptr], #48]\n" - "str q17, [%[c_ptr], #64]\n" - "str q25, [%[c_ptr], #80]\n" - "str q10, [%[c_ptr], #96]\n" - "str q18, [%[c_ptr], #112]\n" - "str q26, [%[c_ptr], #128]\n" - "str q11, [%[c_ptr], #144]\n" - "str q19, [%[c_ptr], #160]\n" - "str q27, [%[c_ptr], #176]\n" - "str q12, [%[c_ptr], #192]\n" - "str q20, [%[c_ptr], #208]\n" - "str q28, [%[c_ptr], #224]\n" - "str q13, [%[c_ptr], #240]\n" - "str q21, [%[c_ptr], #256]\n" - "str q29, [%[c_ptr], #272]\n" - "str q14, [%[c_ptr], #288]\n" - "str q22, [%[c_ptr], #304]\n" - "str q30, [%[c_ptr], #320]\n" - "str q15, [%[c_ptr], #336]\n" - "str q23, [%[c_ptr], #352]\n" - "str q31, [%[c_ptr], #368]\n" - "add %[c_ptr], %[c_ptr], #384\n" - : - [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), - [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a), - [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k) - : [oddk] "r" (oddk) - : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", - "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc" - ); - } - } -} - diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a55.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a55.hpp deleted file mode 100644 index 85d8a502f8..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a55.hpp +++ /dev/null @@ -1,368 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -inline void a64_sgemm_asimd_12x8_a55(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { - const float *a_ptr = Apanel; - float *c_ptr = Cpanel; - - for (int yb=0; yb<ablocks; yb++) { - const float *a_ptr0 = a_ptr; - const float *b_ptr = Bpanel; - - for (int xb=0; xb<bblocks; xb++) { - a_ptr = a_ptr0; - // Fix up for odd lengths - set a flag if K is odd, but make - // sure we round up the iteration count. - int oddk = (K & 1); - int k = ((K+1)/2) - 1; - - register float32x4_t a0 asm("v0"); - register float32x4_t a1 asm("v1"); - register float32x4_t b0 asm("v2"); - register float32x4_t b1 asm("v3"); - register float32x4_t b2 asm("v4"); - register float32x4_t a0a asm("v5"); - register float32x4_t a1a asm("v6"); - - __asm __volatile ( - // Initialize result registers, load initial operands, prime prefetches. - "movi v8.4s, #0x0\n" - "ldr %q[a0], [%[a_ptr]]\n" - "movi v9.4s, #0x0\n" - "ldr %q[b0], [%[b_ptr]]\n" - "movi v10.4s, #0x0\n" - "ldr %q[a1], [%[a_ptr], #16]\n" - "movi v11.4s, #0x0\n" - "ldr %q[b1], [%[b_ptr], #16]\n" - "movi v12.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #64]") - "movi v13.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #64]") - "movi v14.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #128]") - "movi v15.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #128]") - "movi v16.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #192]") - "movi v17.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #256]") - "movi v18.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #192]") - "movi v19.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #320]") - "movi v20.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #256]") - "movi v21.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #384]") - "movi v22.4s, #0x0\n" - "movi v23.4s, #0x0\n" - "movi v24.4s, #0x0\n" - "movi v25.4s, #0x0\n" - "movi v26.4s, #0x0\n" - "movi v27.4s, #0x0\n" - "movi v28.4s, #0x0\n" - "movi v29.4s, #0x0\n" - "movi v30.4s, #0x0\n" - "movi v31.4s, #0x0\n" - - // Skip loop if we are doing zero iterations of it. 
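(The unrolled body that follows leans on the same trick as the A53 variant above: each 128-bit operand load is split into a 64-bit vector load plus a 64-bit general-purpose load merged back in with "ins", which the in-order A53/A55 pipelines can issue alongside the fmla stream. A hedged intrinsics sketch of the semantics only, aarch64-specific:

#include <arm_neon.h>

// Equivalent of the "ldr %d[v], [p]" / "ldr x20, [p, #8]" / "ins v.d[1], x20"
// triplets: two 64-bit loads recombined into one 128-bit register.
static inline float32x4_t split_load_f32(const float *p) {
    float32x2_t lo = vld1_f32(p);      // ldr d-register (low half)
    float32x2_t hi = vld1_f32(p + 2);  // ldr x20 (high half, via a GP reg)
    return vcombine_f32(lo, hi);       // ins .d[1]
}

The sketch is functional, not performance-equivalent; the whole point of the split form is scheduling, not semantics.)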
- "cbz %w[k], 4f\n" - - "1:\n" - // Unroll 0 - "ldr %d[b2], [%[b_ptr], #32]\n" - "fmla v8.4s , %[b0].4s, %[a0].s[0]\n" - - "fmla v9.4s , %[b0].4s, %[a0].s[1]\n" - "ldr x20, [%[b_ptr], #40]\n" - "fmla v10.4s, %[b0].4s, %[a0].s[2]\n" - "fmla v11.4s, %[b0].4s, %[a0].s[3]\n" - "subs %w[k], %w[k], #1\n" - - - "ldr %d[a0a], [%[a_ptr], #32]\n" - "ins %[b2].d[1], x20\n" - - "fmla v12.4s, %[b0].4s, %[a1].s[0]\n" - "fmla v13.4s, %[b0].4s, %[a1].s[1]\n" - "ldr x20, [%[a_ptr], #40]\n" - "fmla v14.4s, %[b0].4s, %[a1].s[2]\n" - "fmla v15.4s, %[b0].4s, %[a1].s[3]\n" - - "ldr %d[a1a], [%[a_ptr], #48]\n" - "ins %[a0a].d[1], x20\n" - - "fmla v16.4s, %[b1].4s, %[a0].s[0]\n" - "fmla v17.4s, %[b1].4s, %[a0].s[1]\n" - "ldr x20, [%[a_ptr], #56]\n" - "fmla v18.4s, %[b1].4s, %[a0].s[2]\n" - "fmla v19.4s, %[b1].4s, %[a0].s[3]\n" - - "ldr %d[b0], [%[b_ptr], #48]\n" - "ins %[a1a].d[1], x20\n" - ASM_PREFETCH("[%[a_ptr], #320]") - "fmla v20.4s, %[b1].4s, %[a1].s[0]\n" - "fmla v21.4s, %[b1].4s, %[a1].s[1]\n" - "ldr x20, [%[b_ptr], #56]\n" - "fmla v22.4s, %[b1].4s, %[a1].s[2]\n" - "fmla v23.4s, %[b1].4s, %[a1].s[3]\n" - - "ldr %d[b1], [%[b_ptr], #64]\n" - "ins %[b0].d[1], x20\n" - - "fmla v24.4s, %[b2].4s, %[a0].s[0]\n" - "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" - "ldr x20, [%[b_ptr], #72]\n" - "fmla v26.4s, %[b2].4s, %[a0].s[2]\n" - "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" - ASM_PREFETCH("[%[b_ptr], #448]") - - - "fmla v28.4s, %[b2].4s, %[a1].s[0]\n" - "fmla v29.4s, %[b2].4s, %[a1].s[1]\n" - ASM_PREFETCH("[%[b_ptr], #512]") - "fmla v30.4s, %[b2].4s, %[a1].s[2]\n" - "fmla v31.4s, %[b2].4s, %[a1].s[3]\n" - - // Unroll 1 - "ldr %d[b2], [%[b_ptr], #80]\n" - "ins %[b1].d[1], x20\n" - - "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n" - "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n" - "ldr x20, [%[b_ptr], #88]\n" - "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n" - "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n" - - "ldr %d[a0], [%[a_ptr], #64]\n" - "ins %[b2].d[1], x20\n" - - "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n" - "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n" - "ldr x20, [%[a_ptr], #72]\n" - "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n" - "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n" - - "ldr %d[a1], [%[a_ptr], #80]\n" - "ins %[a0].d[1], x20\n" - - "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n" - "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n" - "ldr x20, [%[a_ptr], #88]\n" - "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n" - "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n" - - - "ldr %d[b0], [%[b_ptr], #96]\n" - "ins %[a1].d[1], x20\n" - - "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n" - "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n" - "ldr x20, [%[b_ptr], #104]\n" - "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n" - "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n" - - "ldr %d[b1], [%[b_ptr], #112]\n" - "ins %[b0].d[1], x20\n" - - "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n" - "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n" - "ldr x20, [%[b_ptr], #120]\n" - "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n" - "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n" - "add %[a_ptr], %[a_ptr], #64\n" - - "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n" - "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n" - "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n" - "add %[b_ptr], %[b_ptr], #96\n" - "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n" - - - "ldr %d[b2], [%[b_ptr], #32]\n" - "ins %[b1].d[1], x20\n" - - - "bne 1b\n" - - // Branch here if K=1 or 2. Do the right thing for odd/even at the end. - "4:\n" - "cbnz %w[oddk], 2f\n" - "fmla v8.4s , %[b0].4s, %[a0].s[0]\n" - - // Detached final iteration. 
(even K) - "ldr x20, [%[b_ptr], #40]\n" - "fmla v9.4s , %[b0].4s, %[a0].s[1]\n" - "subs %w[k], %w[k], #1\n" - "fmla v10.4s, %[b0].4s, %[a0].s[2]\n" - "fmla v11.4s, %[b0].4s, %[a0].s[3]\n" - - "ldr %d[a0a], [%[a_ptr], #32]\n" - "ins %[b2].d[1], x20\n" - - "fmla v12.4s, %[b0].4s, %[a1].s[0]\n" - "fmla v13.4s, %[b0].4s, %[a1].s[1]\n" - "ldr x20, [%[a_ptr], #40]\n" - "fmla v14.4s, %[b0].4s, %[a1].s[2]\n" - "fmla v15.4s, %[b0].4s, %[a1].s[3]\n" - - "ldr %d[a1a], [%[a_ptr], #48]\n" - "ins %[a0a].d[1], x20\n" - - "fmla v16.4s, %[b1].4s, %[a0].s[0]\n" - "fmla v17.4s, %[b1].4s, %[a0].s[1]\n" - "ldr x20, [%[a_ptr], #56]\n" - "fmla v18.4s, %[b1].4s, %[a0].s[2]\n" - "fmla v19.4s, %[b1].4s, %[a0].s[3]\n" - - "ldr %d[b0], [%[b_ptr], #48]\n" - "ins %[a1a].d[1], x20\n" - - "fmla v20.4s, %[b1].4s, %[a1].s[0]\n" - "fmla v21.4s, %[b1].4s, %[a1].s[1]\n" - "ldr x20, [%[b_ptr], #56]\n" - "fmla v22.4s, %[b1].4s, %[a1].s[2]\n" - "fmla v23.4s, %[b1].4s, %[a1].s[3]\n" - - "fmla v24.4s, %[b2].4s, %[a0].s[0]\n" - "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" - "fmla v26.4s, %[b2].4s, %[a0].s[2]\n" - "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" - - "ldr %d[b1], [%[b_ptr], #64]\n" - "ins %[b0].d[1], x20\n" - - "fmla v28.4s, %[b2].4s, %[a1].s[0]\n" - "fmla v29.4s, %[b2].4s, %[a1].s[1]\n" - "ldr x20, [%[b_ptr], #72]\n" - "fmla v30.4s, %[b2].4s, %[a1].s[2]\n" - "fmla v31.4s, %[b2].4s, %[a1].s[3]\n" - - "ldr %d[b2], [%[b_ptr], #80]\n" - "ins %[b1].d[1], x20\n" - - "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n" - "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n" - "ldr x20, [%[b_ptr], #88]\n" - "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n" - - "ins %[b2].d[1], x20\n" - "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n" - "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n" - "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n" - "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n" - "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n" - "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n" - "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n" - "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n" - "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n" - "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n" - "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n" - "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n" - "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n" - "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n" - "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n" - "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n" - "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n" - "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n" - "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n" - "add %[a_ptr], %[a_ptr], #64\n" - "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n" - "add %[b_ptr], %[b_ptr], #96\n" - "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n" - "b 3f\n" - - // Detached final iteration. 
(odd K) - "2:\n" - - "ldr %d[b2], [%[b_ptr], #32]\n" - "fmla v8.4s , %[b0].4s, %[a0].s[0]\n" - - "fmla v9.4s , %[b0].4s, %[a0].s[1]\n" - "ldr x20, [%[b_ptr], #40]\n" - "fmla v10.4s, %[b0].4s, %[a0].s[2]\n" - "fmla v11.4s, %[b0].4s, %[a0].s[3]\n" - "ins %[b2].d[1], x20\n" - "fmla v12.4s, %[b0].4s, %[a1].s[0]\n" - "fmla v13.4s, %[b0].4s, %[a1].s[1]\n" - "fmla v14.4s, %[b0].4s, %[a1].s[2]\n" - "fmla v15.4s, %[b0].4s, %[a1].s[3]\n" - "fmla v16.4s, %[b1].4s, %[a0].s[0]\n" - "fmla v17.4s, %[b1].4s, %[a0].s[1]\n" - "fmla v18.4s, %[b1].4s, %[a0].s[2]\n" - "fmla v19.4s, %[b1].4s, %[a0].s[3]\n" - "fmla v20.4s, %[b1].4s, %[a1].s[0]\n" - "fmla v21.4s, %[b1].4s, %[a1].s[1]\n" - "fmla v22.4s, %[b1].4s, %[a1].s[2]\n" - "fmla v23.4s, %[b1].4s, %[a1].s[3]\n" - "fmla v24.4s, %[b2].4s, %[a0].s[0]\n" - "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" - "fmla v26.4s, %[b2].4s, %[a0].s[2]\n" - "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" - "fmla v28.4s, %[b2].4s, %[a1].s[0]\n" - "fmla v29.4s, %[b2].4s, %[a1].s[1]\n" - "add %[a_ptr], %[a_ptr], #32\n" - "fmla v30.4s, %[b2].4s, %[a1].s[2]\n" - "add %[b_ptr], %[b_ptr], #48\n" - "fmla v31.4s, %[b2].4s, %[a1].s[3]\n" - - // Common tail - "3:\n" - "str q8, [%[c_ptr]]\n" - "str q16, [%[c_ptr], #16]\n" - "str q24, [%[c_ptr], #32]\n" - "str q9, [%[c_ptr], #48]\n" - "str q17, [%[c_ptr], #64]\n" - "str q25, [%[c_ptr], #80]\n" - "str q10, [%[c_ptr], #96]\n" - "str q18, [%[c_ptr], #112]\n" - "str q26, [%[c_ptr], #128]\n" - "str q11, [%[c_ptr], #144]\n" - "str q19, [%[c_ptr], #160]\n" - "str q27, [%[c_ptr], #176]\n" - "str q12, [%[c_ptr], #192]\n" - "str q20, [%[c_ptr], #208]\n" - "str q28, [%[c_ptr], #224]\n" - "str q13, [%[c_ptr], #240]\n" - "str q21, [%[c_ptr], #256]\n" - "str q29, [%[c_ptr], #272]\n" - "str q14, [%[c_ptr], #288]\n" - "str q22, [%[c_ptr], #304]\n" - "str q30, [%[c_ptr], #320]\n" - "str q15, [%[c_ptr], #336]\n" - "str q23, [%[c_ptr], #352]\n" - "str q31, [%[c_ptr], #368]\n" - "add %[c_ptr], %[c_ptr], #384\n" - : - [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), - [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a), - [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k) - : [oddk] "r" (oddk) - : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", - "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc" - ); - } - } -} diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a55r1.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a55r1.hpp deleted file mode 100644 index 295308053f..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a55r1.hpp +++ /dev/null @@ -1,360 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -inline void a64_sgemm_asimd_12x8_a55r1(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { - const float *a_ptr = Apanel; - float *c_ptr = Cpanel; - for (int yb=0; yb<ablocks; yb++) { - const float *a_ptr0 = a_ptr; - const float *b_ptr = Bpanel; - for (int xb=0; xb<bblocks; xb++) { - a_ptr = a_ptr0; - // Fix up for odd lengths - set a flag if K is odd, but make - // sure we round up the iteration count. - int oddk = (K & 1); - int k = ((K+1)/2) - 1; - - register float32x4_t a0 asm("v0"); - register float32x4_t a1 asm("v1"); - register float32x4_t b0 asm("v2"); - register float32x4_t b1 asm("v3"); - register float32x4_t b2 asm("v4"); - register float32x4_t a0a asm("v5"); - register float32x4_t a1a asm("v6"); - - __asm __volatile ( - // Initialize result registers, load initial operands, prime prefetches. - "ldp %q[a0], %q[a1], [%[a_ptr]]\n" - ASM_PREFETCH("[%[a_ptr], #64]") - - ASM_PREFETCH("[%[a_ptr], #128]") - ASM_PREFETCH("[%[a_ptr], #192]") - "ldp %q[b0], %q[b1], [%[b_ptr]]\n" - ASM_PREFETCH("[%[b_ptr], #64]") - - ASM_PREFETCH("[%[b_ptr], #128]") - ASM_PREFETCH("[%[b_ptr], #192]") - ASM_PREFETCH("[%[b_ptr], #256]") - - ASM_PREFETCH("[%[a_ptr], #256]") - ASM_PREFETCH("[%[a_ptr], #320]") - ASM_PREFETCH("[%[a_ptr], #384]") - - ASM_PREFETCH("[%[b_ptr], #320]") - ASM_PREFETCH("[%[b_ptr], #384]") - ASM_PREFETCH("[%[b_ptr], #448]") - ASM_PREFETCH("[%[b_ptr], #512]") - - "movi v8.4s, #0x0\n" - "movi v9.4s, #0x0\n" - "movi v10.4s, #0x0\n" - "movi v11.4s, #0x0\n" - "movi v12.4s, #0x0\n" - "movi v13.4s, #0x0\n" - "movi v14.4s, #0x0\n" - "movi v15.4s, #0x0\n" - "movi v16.4s, #0x0\n" - "movi v17.4s, #0x0\n" - - "movi v18.4s, #0x0\n" - "movi v19.4s, #0x0\n" - "movi v20.4s, #0x0\n" - "movi v21.4s, #0x0\n" - "movi v22.4s, #0x0\n" - "movi v23.4s, #0x0\n" - "movi v24.4s, #0x0\n" - "movi v25.4s, #0x0\n" - "movi v26.4s, #0x0\n" - "movi v27.4s, #0x0\n" - "movi v28.4s, #0x0\n" - "movi v29.4s, #0x0\n" - "movi v30.4s, #0x0\n" - "movi v31.4s, #0x0\n" - - // Skip loop if we are doing zero iterations of it. 
- "cbz %w[k], 4f\n" - - "1:\n" - // Unroll 0 - "fmla v8.4s , %[b0].4s, %[a0].s[0]\n" - "ldr %d[b2], [%[b_ptr], #32]\n" - - "fmla v9.4s , %[b0].4s, %[a0].s[1]\n" - "ldr x20, [%[b_ptr], #40]\n" - "fmla v10.4s, %[b0].4s, %[a0].s[2]\n" - "subs %w[k], %w[k], #1\n" - "fmla v11.4s, %[b0].4s, %[a0].s[3]\n" - "ldr %d[a0a], [%[a_ptr], #32]\n" - - "fmla v12.4s, %[b0].4s, %[a1].s[0]\n" - "ins %[b2].d[1], x20\n" - "fmla v13.4s, %[b0].4s, %[a1].s[1]\n" - "ldr x20, [%[a_ptr], #40]\n" - "fmla v14.4s, %[b0].4s, %[a1].s[2]\n" - "fmla v15.4s, %[b0].4s, %[a1].s[3]\n" - "ldr %d[a1a], [%[a_ptr], #48]\n" - - "fmla v16.4s, %[b1].4s, %[a0].s[0]\n" - "ins %[a0a].d[1], x20\n" - "fmla v17.4s, %[b1].4s, %[a0].s[1]\n" - "ldr x20, [%[a_ptr], #56]\n" - "fmla v18.4s, %[b1].4s, %[a0].s[2]\n" - "fmla v19.4s, %[b1].4s, %[a0].s[3]\n" - "ldr %d[b0], [%[b_ptr], #48]\n" - - "fmla v20.4s, %[b1].4s, %[a1].s[0]\n" - "ins %[a1a].d[1], x20\n" - "fmla v21.4s, %[b1].4s, %[a1].s[1]\n" - "ldr x20, [%[b_ptr], #56]\n" - "fmla v22.4s, %[b1].4s, %[a1].s[2]\n" - "fmla v23.4s, %[b1].4s, %[a1].s[3]\n" - "ldr %d[b1], [%[b_ptr], #64]\n" - - "fmla v24.4s, %[b2].4s, %[a0].s[0]\n" - "ins %[b0].d[1], x20\n" - "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" - "ldr x20, [%[b_ptr], #72]\n" - "fmla v26.4s, %[b2].4s, %[a0].s[2]\n" - "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" - ASM_PREFETCH("[%[a_ptr], #448]") - - "fmla v28.4s, %[b2].4s, %[a1].s[0]\n" - "fmla v29.4s, %[b2].4s, %[a1].s[1]\n" - ASM_PREFETCH("[%[b_ptr], #576]") - "fmla v30.4s, %[b2].4s, %[a1].s[2]\n" - "fmla v31.4s, %[b2].4s, %[a1].s[3]\n" - - // Unroll 1 - "ldr %d[b2], [%[b_ptr], #80]\n" - - "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n" - "ins %[b1].d[1], x20\n" - "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n" - "ldr x20, [%[b_ptr], #88]\n" - "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n" - "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n" - "ldr %d[a0], [%[a_ptr], #64]\n" - - "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n" - "ins %[b2].d[1], x20\n" - "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n" - "ldr x20, [%[a_ptr], #72]\n" - "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n" - "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n" - "ldr %d[a1], [%[a_ptr], #80]\n" - - "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n" - "ins %[a0].d[1], x20\n" - "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n" - "ldr x20, [%[a_ptr], #88]\n" - "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n" - "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n" - "ldr %d[b0], [%[b_ptr], #96]\n" - - "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n" - "ins %[a1].d[1], x20\n" - "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n" - "ldr x20, [%[b_ptr], #104]\n" - "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n" - "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n" - "ldr %d[b1], [%[b_ptr], #112]\n" - - "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n" - "ins %[b0].d[1], x20\n" - "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n" - "ldr x20, [%[b_ptr], #120]\n" - "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n" - - "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n" - "add %[a_ptr], %[a_ptr], #64\n" - - "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n" - ASM_PREFETCH("[%[b_ptr], #640]") - "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n" - "add %[b_ptr], %[b_ptr], #96\n" - "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n" - "ins %[b1].d[1], x20\n" - "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n" - "ldr %d[b2], [%[b_ptr], #32]\n" - - - "bne 1b\n" - - // Branch here if K=1 or 2. Do the right thing for odd/even at the end. - "4:\n" - "cbnz %w[oddk], 2f\n" - "fmla v8.4s , %[b0].4s, %[a0].s[0]\n" - - // Detached final iteration. 
(even K) - "fmla v9.4s , %[b0].4s, %[a0].s[1]\n" - "ldr x20, [%[b_ptr], #40]\n" - "fmla v10.4s, %[b0].4s, %[a0].s[2]\n" - "subs %w[k], %w[k], #1\n" - "fmla v11.4s, %[b0].4s, %[a0].s[3]\n" - "ldr %d[a0a], [%[a_ptr], #32]\n" - - "fmla v12.4s, %[b0].4s, %[a1].s[0]\n" - "ins %[b2].d[1], x20\n" - "fmla v13.4s, %[b0].4s, %[a1].s[1]\n" - "ldr x20, [%[a_ptr], #40]\n" - "fmla v14.4s, %[b0].4s, %[a1].s[2]\n" - - "fmla v15.4s, %[b0].4s, %[a1].s[3]\n" - "ldr %d[a1a], [%[a_ptr], #48]\n" - - "fmla v16.4s, %[b1].4s, %[a0].s[0]\n" - "ins %[a0a].d[1], x20\n" - "fmla v17.4s, %[b1].4s, %[a0].s[1]\n" - "ldr x20, [%[a_ptr], #56]\n" - "fmla v18.4s, %[b1].4s, %[a0].s[2]\n" - - "fmla v19.4s, %[b1].4s, %[a0].s[3]\n" - "ldr %d[b0], [%[b_ptr], #48]\n" - - "fmla v20.4s, %[b1].4s, %[a1].s[0]\n" - "ins %[a1a].d[1], x20\n" - "fmla v21.4s, %[b1].4s, %[a1].s[1]\n" - "ldr x20, [%[b_ptr], #56]\n" - "fmla v22.4s, %[b1].4s, %[a1].s[2]\n" - "fmla v23.4s, %[b1].4s, %[a1].s[3]\n" - - "fmla v24.4s, %[b2].4s, %[a0].s[0]\n" - "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" - "fmla v26.4s, %[b2].4s, %[a0].s[2]\n" - "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" - "ldr %d[b1], [%[b_ptr], #64]\n" - - "fmla v28.4s, %[b2].4s, %[a1].s[0]\n" - "ins %[b0].d[1], x20\n" - "fmla v29.4s, %[b2].4s, %[a1].s[1]\n" - "ldr x20, [%[b_ptr], #72]\n" - "fmla v30.4s, %[b2].4s, %[a1].s[2]\n" - - "fmla v31.4s, %[b2].4s, %[a1].s[3]\n" - "ldr %d[b2], [%[b_ptr], #80]\n" - - "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n" - "ins %[b1].d[1], x20\n" - "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n" - "ldr x20, [%[b_ptr], #88]\n" - "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n" - "ins %[b2].d[1], x20\n" - - "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n" - "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n" - "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n" - "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n" - "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n" - "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n" - "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n" - "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n" - "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n" - "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n" - "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n" - "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n" - "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n" - "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n" - "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n" - "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n" - "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n" - "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n" - "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n" - "add %[a_ptr], %[a_ptr], #64\n" - "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n" - "add %[b_ptr], %[b_ptr], #96\n" - "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n" - "b 3f\n" - - // Detached final iteration. 
(odd K) - "2:\n" - - "fmla v8.4s , %[b0].4s, %[a0].s[0]\n" - "ldr %d[b2], [%[b_ptr], #32]\n" - "fmla v9.4s , %[b0].4s, %[a0].s[1]\n" - "ldr x20, [%[b_ptr], #40]\n" - "fmla v10.4s, %[b0].4s, %[a0].s[2]\n" - "fmla v11.4s, %[b0].4s, %[a0].s[3]\n" - "fmla v12.4s, %[b0].4s, %[a1].s[0]\n" - "ins %[b2].d[1], x20\n" - "fmla v13.4s, %[b0].4s, %[a1].s[1]\n" - "fmla v14.4s, %[b0].4s, %[a1].s[2]\n" - "fmla v15.4s, %[b0].4s, %[a1].s[3]\n" - "fmla v16.4s, %[b1].4s, %[a0].s[0]\n" - "fmla v17.4s, %[b1].4s, %[a0].s[1]\n" - "fmla v18.4s, %[b1].4s, %[a0].s[2]\n" - "fmla v19.4s, %[b1].4s, %[a0].s[3]\n" - "fmla v20.4s, %[b1].4s, %[a1].s[0]\n" - "fmla v21.4s, %[b1].4s, %[a1].s[1]\n" - "fmla v22.4s, %[b1].4s, %[a1].s[2]\n" - "fmla v23.4s, %[b1].4s, %[a1].s[3]\n" - "fmla v24.4s, %[b2].4s, %[a0].s[0]\n" - "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" - "fmla v26.4s, %[b2].4s, %[a0].s[2]\n" - "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" - "fmla v28.4s, %[b2].4s, %[a1].s[0]\n" - "fmla v29.4s, %[b2].4s, %[a1].s[1]\n" - "add %[a_ptr], %[a_ptr], #32\n" - "fmla v30.4s, %[b2].4s, %[a1].s[2]\n" - "add %[b_ptr], %[b_ptr], #48\n" - "fmla v31.4s, %[b2].4s, %[a1].s[3]\n" - - // Common tail - "3:\n" - "str q8, [%[c_ptr]]\n" - "str q16, [%[c_ptr], #16]\n" - "str q24, [%[c_ptr], #32]\n" - "str q9, [%[c_ptr], #48]\n" - "str q17, [%[c_ptr], #64]\n" - "str q25, [%[c_ptr], #80]\n" - "str q10, [%[c_ptr], #96]\n" - "str q18, [%[c_ptr], #112]\n" - "str q26, [%[c_ptr], #128]\n" - "str q11, [%[c_ptr], #144]\n" - "str q19, [%[c_ptr], #160]\n" - "str q27, [%[c_ptr], #176]\n" - "str q12, [%[c_ptr], #192]\n" - "str q20, [%[c_ptr], #208]\n" - "str q28, [%[c_ptr], #224]\n" - "str q13, [%[c_ptr], #240]\n" - "str q21, [%[c_ptr], #256]\n" - "str q29, [%[c_ptr], #272]\n" - "str q14, [%[c_ptr], #288]\n" - "str q22, [%[c_ptr], #304]\n" - "str q30, [%[c_ptr], #320]\n" - "str q15, [%[c_ptr], #336]\n" - "str q23, [%[c_ptr], #352]\n" - "str q31, [%[c_ptr], #368]\n" - "add %[c_ptr], %[c_ptr], #384\n" - : - [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), - [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a), - [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k) - : [oddk] "r" (oddk) - : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", - "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc" - ); - } - } -} diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/generic.hpp deleted file mode 100644 index c4a5875a31..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/generic.hpp +++ /dev/null @@ -1,358 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#include <arm_neon.h> - -// Kernel implementation. -// -// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order. -// Assume that "Bpanel" points to a chunk of B blocks (each size 12xK) in read-order. -// Assume that "Cpanel" points to a chunk of C output blocks (each size -// 12x8), the chunks being arranged in a row major fashion. -// -// Note that the intent of this is that either ablocks or bblocks will be 1 -// - this construction allows the output loop to proceed in either order. - -inline void a64_sgemm_asimd_12x8_jumps(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K, long int row_jump=0, long int block_jump=0) { - const float *a_ptr = Apanel; - float *c_ptr = Cpanel; - - for (int yb=0; yb<ablocks; yb++) { - const float *a_ptr0 = a_ptr; - const float *b_ptr = Bpanel; - - for (int xb=0; xb<bblocks; xb++) { - a_ptr = a_ptr0; - // Fix up for odd lengths - set a flag if K is odd, but make - // sure we round up the iteration count. - int oddk = (K & 1); - int k = ((K+1)/2) - 1; - - register float32x4_t a0 asm("v0"); - register float32x4_t a1 asm("v1"); - register float32x4_t b0 asm("v2"); - register float32x4_t b1 asm("v3"); - register float32x4_t b2 asm("v4"); - register float32x4_t a0a asm("v5"); - register float32x4_t a1a asm("v6"); - - __asm __volatile ( - // Initialize result registers, load initial operands, prime prefetches. - "movi v8.4s, #0x0\n" - "ldr %q[a0], [%[a_ptr]]\n" - "movi v9.4s, #0x0\n" - "ldr %q[b0], [%[b_ptr]]\n" - "movi v10.4s, #0x0\n" - "ldr %q[a1], [%[a_ptr], #16]\n" - "movi v11.4s, #0x0\n" - "ldr %q[b1], [%[b_ptr], #16]\n" - "movi v12.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #64]") - "movi v13.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #64]") - "movi v14.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #128]") - "movi v15.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #128]") - "movi v16.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #192]") - "movi v17.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #256]") - "movi v18.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #192]") - "movi v19.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #320]") - "movi v20.4s, #0x0\n" - ASM_PREFETCH("[%[a_ptr], #256]") - "movi v21.4s, #0x0\n" - ASM_PREFETCH("[%[b_ptr], #384]") - "movi v22.4s, #0x0\n" - "movi v23.4s, #0x0\n" - "movi v24.4s, #0x0\n" - "movi v25.4s, #0x0\n" - "movi v26.4s, #0x0\n" - "movi v27.4s, #0x0\n" - "movi v28.4s, #0x0\n" - "movi v29.4s, #0x0\n" - "movi v30.4s, #0x0\n" - "movi v31.4s, #0x0\n" - - // Skip loop if we are doing zero iterations of it. 
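(In the loop that follows, row_jump is added to b_ptr after each group of three 16-byte B vectors, and block_jump adds a one-off extra skip when the even-K tail finishes a block, so a caller can stride over padding between B rows. A plain byte-arithmetic model of that addressing, with an illustrative helper name:

#include <cstdint>

// Model of the extra B-panel strides in the "jumps" variant: each group of
// three B vectors (48 bytes) may be followed by a row_jump skip.
inline const float *b_next_group(const float *b, long row_jump_bytes) {
    auto addr = reinterpret_cast<uintptr_t>(b);
    return reinterpret_cast<const float *>(addr + 48 + row_jump_bytes);
}

With row_jump = block_jump = 0, as in the a64_sgemm_asimd_12x8 wrapper at the end of this file, the walk degenerates to the dense layout used by the CPU-specific variants above.)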
- "cbz %w[k], 4f\n" - - // Loop proper - "1:\n" - "fmla v8.4s , %[b0].4s, %[a0].s[0]\n" - "fmla v9.4s , %[b0].4s, %[a0].s[1]\n" - "ldr %q[b2], [%[b_ptr], #32]\n" - "fmla v10.4s, %[b0].4s, %[a0].s[2]\n" - "add %[b_ptr], %[b_ptr], %[row_jump]\n" - "fmla v11.4s, %[b0].4s, %[a0].s[3]\n" - "ldr %q[a0a], [%[a_ptr], #32]\n" - "fmla v12.4s, %[b0].4s, %[a1].s[0]\n" - "fmla v13.4s, %[b0].4s, %[a1].s[1]\n" - "ldr %q[a1a], [%[a_ptr], #48]\n" - "fmla v14.4s, %[b0].4s, %[a1].s[2]\n" - "fmla v15.4s, %[b0].4s, %[a1].s[3]\n" - "ldr %q[b0], [%[b_ptr], #48]\n" - - "fmla v16.4s, %[b1].4s, %[a0].s[0]\n" - "fmla v17.4s, %[b1].4s, %[a0].s[1]\n" - ASM_PREFETCH("[%[a_ptr], #320]") - "fmla v18.4s, %[b1].4s, %[a0].s[2]\n" - "fmla v19.4s, %[b1].4s, %[a0].s[3]\n" - "fmla v20.4s, %[b1].4s, %[a1].s[0]\n" - "fmla v21.4s, %[b1].4s, %[a1].s[1]\n" - "fmla v22.4s, %[b1].4s, %[a1].s[2]\n" - "fmla v23.4s, %[b1].4s, %[a1].s[3]\n" - "ldr %q[b1], [%[b_ptr], #64]\n" - - "fmla v24.4s, %[b2].4s, %[a0].s[0]\n" - "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" - ASM_PREFETCH("[%[b_ptr], #448]") - "fmla v26.4s, %[b2].4s, %[a0].s[2]\n" - "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" - "fmla v28.4s, %[b2].4s, %[a1].s[0]\n" - "fmla v29.4s, %[b2].4s, %[a1].s[1]\n" - "fmla v30.4s, %[b2].4s, %[a1].s[2]\n" - "fmla v31.4s, %[b2].4s, %[a1].s[3]\n" - "ldr %q[b2], [%[b_ptr], #80]\n" - - "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n" - "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n" - "ldr %q[a0], [%[a_ptr], #64]\n" - "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n" - "add %[b_ptr], %[b_ptr], %[row_jump]\n" - "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n" - "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n" - "ldr %q[a1], [%[a_ptr], #80]\n" - "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n" - "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n" - "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n" - "ldr %q[b0], [%[b_ptr], #96]\n" - - "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n" - "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n" - ASM_PREFETCH("[%[b_ptr], #512]") - "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n" - "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n" - "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n" - "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n" - "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n" - "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n" - "ldr %q[b1], [%[b_ptr], #112]\n" - - "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n" - "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n" - "add %[a_ptr], %[a_ptr], #64\n" - "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n" - "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n" - "add %[b_ptr], %[b_ptr], #96\n" - "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n" - "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n" - "subs %w[k], %w[k], #1\n" - "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n" - "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n" - "bne 1b\n" - - // Target to use when K is 1 or 2 (i.e. 
zero iterations of main loop) - "4:\n" - - // Branch to alternative tail for odd K - "cbnz %w[oddk], 2f\n" - - // Detached final iteration (even K) - "fmla v8.4s , %[b0].4s, %[a0].s[0]\n" - "fmla v9.4s , %[b0].4s, %[a0].s[1]\n" - "ldr %q[b2], [%[b_ptr], #32]\n" - "fmla v10.4s, %[b0].4s, %[a0].s[2]\n" - "add %[b_ptr], %[b_ptr], %[row_jump]\n" - "fmla v11.4s, %[b0].4s, %[a0].s[3]\n" - "ldr %q[a0a], [%[a_ptr], #32]\n" - "fmla v12.4s, %[b0].4s, %[a1].s[0]\n" - "fmla v13.4s, %[b0].4s, %[a1].s[1]\n" - "ldr %q[a1a], [%[a_ptr], #48]\n" - "fmla v14.4s, %[b0].4s, %[a1].s[2]\n" - "fmla v15.4s, %[b0].4s, %[a1].s[3]\n" - "ldr %q[b0], [%[b_ptr], #48]\n" - - "fmla v16.4s, %[b1].4s, %[a0].s[0]\n" - "fmla v17.4s, %[b1].4s, %[a0].s[1]\n" - "fmla v18.4s, %[b1].4s, %[a0].s[2]\n" - "fmla v19.4s, %[b1].4s, %[a0].s[3]\n" - "fmla v20.4s, %[b1].4s, %[a1].s[0]\n" - "fmla v21.4s, %[b1].4s, %[a1].s[1]\n" - "fmla v22.4s, %[b1].4s, %[a1].s[2]\n" - "fmla v23.4s, %[b1].4s, %[a1].s[3]\n" - "ldr %q[b1], [%[b_ptr], #64]\n" - - "fmla v24.4s, %[b2].4s, %[a0].s[0]\n" - "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" - "add %[a_ptr], %[a_ptr], #64\n" - "fmla v26.4s, %[b2].4s, %[a0].s[2]\n" - "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" - "fmla v28.4s, %[b2].4s, %[a1].s[0]\n" - "fmla v29.4s, %[b2].4s, %[a1].s[1]\n" - "fmla v30.4s, %[b2].4s, %[a1].s[2]\n" - "fmla v31.4s, %[b2].4s, %[a1].s[3]\n" - "ldr %q[b2], [%[b_ptr], #80]\n" - - "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n" - "add %[b_ptr], %[b_ptr], %[block_jump]\n" - "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n" - "add %[b_ptr], %[b_ptr], #96\n" - "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n" - "add %[b_ptr], %[b_ptr], %[row_jump]\n" - "str q8, [%[c_ptr], #0]\n" - "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n" - "str q16, [%[c_ptr], #16]\n" - "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n" - "str q24, [%[c_ptr], #32]\n" - - "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n" - "str q9, [%[c_ptr], #48]\n" - "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n" - "str q17, [%[c_ptr], #64]\n" - "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n" - "str q25, [%[c_ptr], #80]\n" - "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n" - "str q10, [%[c_ptr], #96]\n" - - "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n" - "str q18, [%[c_ptr], #112]\n" - "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n" - "str q26, [%[c_ptr], #128]\n" - "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n" - "str q11, [%[c_ptr], #144]\n" - - "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n" - "str q19, [%[c_ptr], #160]\n" - "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n" - "str q27, [%[c_ptr], #176]\n" - "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n" - "str q12, [%[c_ptr], #192]\n" - - "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n" - "str q20, [%[c_ptr], #208]\n" - "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n" - "str q28, [%[c_ptr], #224]\n" - "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n" - "str q13, [%[c_ptr], #240]\n" - - "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n" - "str q21, [%[c_ptr], #256]\n" - "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n" - "str q29, [%[c_ptr], #272]\n" - "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n" - "str q14, [%[c_ptr], #288]\n" - - "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n" - "str q22, [%[c_ptr], #304]\n" - "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n" - "str q30, [%[c_ptr], #320]\n" - "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n" - "str q15, [%[c_ptr], #336]\n" - - "b 3f\n" - - // Detached final iteration (odd K) - "2:\n" - "fmla v8.4s , %[b0].4s, %[a0].s[0]\n" - "ldr %q[b2], [%[b_ptr], #32]\n" - "fmla v16.4s, %[b1].4s, %[a0].s[0]\n" - "add %[b_ptr], %[b_ptr], %[row_jump]\n" - "fmla v9.4s , %[b0].4s, %[a0].s[1]\n" - "str q8, [%[c_ptr], #0]\n" - "fmla v17.4s, %[b1].4s, %[a0].s[1]\n" - "str q16, 
[%[c_ptr], #16]\n" - "fmla v24.4s, %[b2].4s, %[a0].s[0]\n" - "add %[b_ptr], %[b_ptr], #48\n" - "add %[a_ptr], %[a_ptr], #32\n" - "str q24, [%[c_ptr], #32]\n" - "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" - "str q9, [%[c_ptr], #48]\n" - - "fmla v10.4s, %[b0].4s, %[a0].s[2]\n" - "str q17, [%[c_ptr], #64]\n" - "fmla v18.4s, %[b1].4s, %[a0].s[2]\n" - "str q25, [%[c_ptr], #80]\n" - "fmla v26.4s, %[b2].4s, %[a0].s[2]\n" - "str q10, [%[c_ptr], #96]\n" - - "fmla v11.4s, %[b0].4s, %[a0].s[3]\n" - "str q18, [%[c_ptr], #112]\n" - "fmla v19.4s, %[b1].4s, %[a0].s[3]\n" - "str q26, [%[c_ptr], #128]\n" - "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" - "str q11, [%[c_ptr], #144]\n" - - "fmla v12.4s, %[b0].4s, %[a1].s[0]\n" - "str q19, [%[c_ptr], #160]\n" - "fmla v20.4s, %[b1].4s, %[a1].s[0]\n" - "str q27, [%[c_ptr], #176]\n" - "fmla v28.4s, %[b2].4s, %[a1].s[0]\n" - "str q12, [%[c_ptr], #192]\n" - - "fmla v13.4s, %[b0].4s, %[a1].s[1]\n" - "str q20, [%[c_ptr], #208]\n" - "fmla v21.4s, %[b1].4s, %[a1].s[1]\n" - "str q28, [%[c_ptr], #224]\n" - "fmla v29.4s, %[b2].4s, %[a1].s[1]\n" - "str q13, [%[c_ptr], #240]\n" - - "fmla v14.4s, %[b0].4s, %[a1].s[2]\n" - "str q21, [%[c_ptr], #256]\n" - "fmla v22.4s, %[b1].4s, %[a1].s[2]\n" - "str q29, [%[c_ptr], #272]\n" - "fmla v30.4s, %[b2].4s, %[a1].s[2]\n" - "str q14, [%[c_ptr], #288]\n" - - "fmla v15.4s, %[b0].4s, %[a1].s[3]\n" - "str q22, [%[c_ptr], #304]\n" - "fmla v23.4s, %[b1].4s, %[a1].s[3]\n" - "str q30, [%[c_ptr], #320]\n" - "fmla v31.4s, %[b2].4s, %[a1].s[3]\n" - "str q15, [%[c_ptr], #336]\n" - - // Common tail - "3:\n" - "str q23, [%[c_ptr], #352]\n" - "str q31, [%[c_ptr], #368]\n" - "add %[c_ptr], %[c_ptr], #384\n" - : - [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), - [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a), - [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k) - : [oddk] "r" (oddk), [row_jump] "r" (row_jump), [block_jump] "r" (block_jump) - : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", - "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc" - ); - } - } -} - -inline void a64_sgemm_asimd_12x8(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { - a64_sgemm_asimd_12x8_jumps(Apanel, Bpanel, Cpanel, ablocks, bblocks, K, 0, 0); -} diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemv_trans.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemv_trans.hpp deleted file mode 100644 index 2a39ca1f07..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemv_trans.hpp +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright (c) 2017-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __aarch64__ - -// Actual kernel implementations -#include "generic.hpp" - -// Transposed SGEMV strategy class. -class sgemv_trans { -public: - typedef float operand_type; - typedef float result_type; - - typedef void (*kern_type)(const float *, const float *, float *, float, int, int, int); - - /* Kernel blocking parameters */ - static const int out_width = 12; - static const int k_unroll = 1; - - kern_type kernel; - - sgemv_trans(const CPUInfo *ci) { - kernel = a64_sgemv_trans; - } -}; - -#endif // __aarch64__ diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/generic.hpp deleted file mode 100644 index 33f2b701cf..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/generic.hpp +++ /dev/null @@ -1,913 +0,0 @@ -/* - * Copyright (c) 2017-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#include <arm_neon.h> -#include "asmlib.hpp" - -// Kernel implementation - transposed GEMV -// -// The kernel will process "M" rows of A (= steps of dot product) and "N" -// columns (= dot products total) -// -// General plan is to do as many columns simultaneously as possible - a -// reasonable limit is half the NEON regfile = 64 total accumulators. -// -// It's possible that messing around with sub-blocking M and N can yield -// higher performance, but that's left to the outer loop. In this kernel we -// process all of M at the same time. - - -// How far ahead to prefetch for the first and subsequent prefetches. -// These values work for A72 on JunoR2... 
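(The stripe width used below follows directly from the register budget sketched above: 24 of the 32 NEON q-registers, v8 through v31, serve as accumulators of 4 floats each, giving the 96-column stripes that the N loop consumes. FIRST_PFD and PFD, defined next, are prefetch distances in whole rows of A (lda floats apart) and are clamped to M-1 before use. A quick restatement of the stripe accounting:

// Register budget behind the 96-column stripes: v8..v31 as accumulators.
constexpr int kAccRegs = 24;                 // q-registers used as accumulators
constexpr int kLanes   = 4;                  // floats per 128-bit register
constexpr int kStripe  = kAccRegs * kLanes;  // columns per N iteration
static_assert(kStripe == 96, "matches the 'N -= 96' loop below");
)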
- -#define FIRST_PFD 9 -#define PFD 6 - -inline void a64_sgemv_trans(const float *Astart, const float *Xstart, float *Ystart, float alpha, int lda, int M, int N) { - const float *a_ptr_base = Astart; - float *y_ptr = Ystart; - - register const float32x4_t va asm("v1") = vdupq_n_f32(alpha); - - int firstpfd=FIRST_PFD; - if (firstpfd > M) { - firstpfd = (M-1); - } - - int pfd = PFD; - if (pfd > M) { - pfd = (M-1); - } - - ptrdiff_t jump = lda * sizeof(int); - - for (;N>=96;N-=96) { - int k = M-1; - - const float *a_ptr = a_ptr_base; - const float *x_ptr = Xstart; - const float *pf_ptr = a_ptr; - const float *firstpf_ptr = a_ptr; - const float *pf_limit = a_ptr + (M * lda); - - for (int i=0; i<firstpfd; i++) { - prefetch_1x(firstpf_ptr); - firstpf_ptr += lda; - } - - for (int i=0; i<pfd; i++) { - prefetch_5x(pf_ptr + 16); - pf_ptr += lda; - } - - a_ptr_base += 96; - - __asm __volatile ( - "movi v8.4s,#0x0\n" - "ldr w0, [%[x_ptr]]\n" - "movi v9.4s,#0x0\n" - "ldr q2, [%[a_ptr], #0]\n" - "movi v10.4s,#0x0\n" - "ldr q3, [%[a_ptr], #0x10]\n" - "movi v11.4s,#0x0\n" - "ldr q4, [%[a_ptr], #0x20]\n" - "movi v12.4s,#0x0\n" - "ldr q5, [%[a_ptr], #0x30]\n" - "movi v13.4s,#0x0\n" - "ldr q6, [%[a_ptr], #0x40]\n" - "movi v14.4s,#0x0\n" - "ldr q7, [%[a_ptr], #0x50]\n" - "movi v15.4s,#0x0\n" - ASM_PREFETCH("[%[firstpf_ptr]]") - "movi v16.4s, #0x0\n" - "movi v17.4s, #0x0\n" - ASM_PREFETCH("[%[pf_ptr], #64]") - "movi v18.4s, #0x0\n" - "movi v19.4s, #0x0\n" - ASM_PREFETCH("[%[pf_ptr], #128]") - "movi v20.4s, #0x0\n" - "movi v21.4s, #0x0\n" - ASM_PREFETCH("[%[pf_ptr], #192]") - "movi v22.4s, #0x0\n" - "movi v23.4s, #0x0\n" - ASM_PREFETCH("[%[pf_ptr], #256]") - "movi v24.4s, #0x0\n" - "movi v25.4s, #0x0\n" - ASM_PREFETCH("[%[pf_ptr], #320]") - "movi v26.4s, #0x0\n" - "movi v27.4s, #0x0\n" - "add %[pf_ptr], %[pf_ptr], %[jump]\n" - "movi v28.4s, #0x0\n" - "add %[firstpf_ptr], %[firstpf_ptr], %[jump]\n" - "movi v29.4s, #0x0\n" - "movi v30.4s, #0x0\n" - "movi v31.4s, #0x0\n" - - // Skip everything if there are no iterations of the main loop to do. - "cbz %w[k], 10f\n" - - // Loop with all prefetches. Exit this loop when firstpf_ptr - // hits pf_limit. 
- "1:\n" - "dup v0.4s, w0\n" - "ldr w0, [%[x_ptr], #4]\n" - "add %[x_ptr], %[x_ptr], #0x4\n" - "fmla v8.4s, v2.4s, v0.4s\n" - "ldr q2, [%[a_ptr], #0x60]\n" - "fmla v9.4s, v3.4s, v0.4s\n" - "ldr q3, [%[a_ptr], #0x70]\n" - ASM_PREFETCH("[%[firstpf_ptr]]") - "fmla v10.4s, v4.4s, v0.4s\n" - "ldr q4, [%[a_ptr], #0x80]\n" - "add %[firstpf_ptr], %[firstpf_ptr], %[jump]\n" - "fmla v11.4s, v5.4s, v0.4s\n" - "ldr q5, [%[a_ptr], #0x90]\n" - "sub %w[k], %w[k], #1\n" - ASM_PREFETCH("[%[x_ptr], #128]") - "fmla v12.4s, v6.4s, v0.4s\n" - "ldr q6, [%[a_ptr], #0xa0]\n" - "fmla v13.4s, v7.4s, v0.4s\n" - "ldr q7, [%[a_ptr], #0xb0]\n" - ASM_PREFETCH("[%[pf_ptr], #0x40]") - "fmla v14.4s, v2.4s, v0.4s\n" - "ldr q2, [%[a_ptr], #0xc0]\n" - "fmla v15.4s, v3.4s, v0.4s\n" - "ldr q3, [%[a_ptr], #0xd0]\n" - "fmla v16.4s, v4.4s, v0.4s\n" - "ldr q4, [%[a_ptr], #0xe0]\n" - "fmla v17.4s, v5.4s, v0.4s\n" - "ldr q5, [%[a_ptr], #0xf0]\n" - ASM_PREFETCH("[%[pf_ptr], #0x80]") - "fmla v18.4s, v6.4s, v0.4s\n" - "ldr q6, [%[a_ptr], #0x100]\n" - "fmla v19.4s, v7.4s, v0.4s\n" - "ldr q7, [%[a_ptr], #0x110]\n" - "fmla v20.4s, v2.4s, v0.4s\n" - "ldr q2, [%[a_ptr], #0x120]\n" - "fmla v21.4s, v3.4s, v0.4s\n" - "ldr q3, [%[a_ptr], #0x130]\n" - ASM_PREFETCH("[%[pf_ptr], #0xc0]") - "fmla v22.4s, v4.4s, v0.4s\n" - "ldr q4, [%[a_ptr], #0x140]\n" - "fmla v23.4s, v5.4s, v0.4s\n" - "ldr q5, [%[a_ptr], #0x150]\n" - "fmla v24.4s, v6.4s, v0.4s\n" - "ldr q6, [%[a_ptr], #0x160]\n" - "fmla v25.4s, v7.4s, v0.4s\n" - "ldr q7, [%[a_ptr], #0x170]\n" - ASM_PREFETCH("[%[pf_ptr], #0x100]") - "add %[a_ptr], %[a_ptr], %[jump]\n" - "fmla v26.4s, v2.4s, v0.4s\n" - "ldr q2, [%[a_ptr], #0x00]\n" - "fmla v27.4s, v3.4s, v0.4s\n" - "ldr q3, [%[a_ptr], #0x10]\n" - "fmla v28.4s, v4.4s, v0.4s\n" - "ldr q4, [%[a_ptr], #0x20]\n" - "fmla v29.4s, v5.4s, v0.4s\n" - "ldr q5, [%[a_ptr], #0x30]\n" - ASM_PREFETCH("[%[pf_ptr], #0x140]") - "fmla v30.4s, v6.4s, v0.4s\n" - "add %[pf_ptr], %[pf_ptr], %[jump]\n" - "ldr q6, [%[a_ptr], #0x40]\n" - "fmla v31.4s, v7.4s, v0.4s\n" - "cmp %[firstpf_ptr], %[pf_limit]\n" - "ldr q7, [%[a_ptr], #0x50]\n" - "blt 1b\n" - - // Check that there are still "main" prefetches to do. - "cmp %[pf_ptr], %[pf_limit]\n" - "bge 9f\n" - - // Just the main prefetches, exit this loop when pf_ptr hits pf_limit. 
- "8:\n" - "dup v0.4s, w0\n" - "ldr w0, [%[x_ptr], #4]\n" - "add %[x_ptr], %[x_ptr], #0x4\n" - "fmla v8.4s, v2.4s, v0.4s\n" - "ldr q2, [%[a_ptr], #0x60]\n" - "fmla v9.4s, v3.4s, v0.4s\n" - "ldr q3, [%[a_ptr], #0x70]\n" - "fmla v10.4s, v4.4s, v0.4s\n" - "ldr q4, [%[a_ptr], #0x80]\n" - "fmla v11.4s, v5.4s, v0.4s\n" - "ldr q5, [%[a_ptr], #0x90]\n" - "sub %w[k], %w[k], #1\n" - ASM_PREFETCH("[%[x_ptr], #128]") - "fmla v12.4s, v6.4s, v0.4s\n" - "ldr q6, [%[a_ptr], #0xa0]\n" - "fmla v13.4s, v7.4s, v0.4s\n" - "ldr q7, [%[a_ptr], #0xb0]\n" - ASM_PREFETCH("[%[pf_ptr], #0x40]") - "fmla v14.4s, v2.4s, v0.4s\n" - "ldr q2, [%[a_ptr], #0xc0]\n" - "fmla v15.4s, v3.4s, v0.4s\n" - "ldr q3, [%[a_ptr], #0xd0]\n" - "fmla v16.4s, v4.4s, v0.4s\n" - "ldr q4, [%[a_ptr], #0xe0]\n" - "fmla v17.4s, v5.4s, v0.4s\n" - "ldr q5, [%[a_ptr], #0xf0]\n" - ASM_PREFETCH("[%[pf_ptr], #0x80]") - "fmla v18.4s, v6.4s, v0.4s\n" - "ldr q6, [%[a_ptr], #0x100]\n" - "fmla v19.4s, v7.4s, v0.4s\n" - "ldr q7, [%[a_ptr], #0x110]\n" - "fmla v20.4s, v2.4s, v0.4s\n" - "ldr q2, [%[a_ptr], #0x120]\n" - "fmla v21.4s, v3.4s, v0.4s\n" - "ldr q3, [%[a_ptr], #0x130]\n" - ASM_PREFETCH("[%[pf_ptr], #0xc0]") - "fmla v22.4s, v4.4s, v0.4s\n" - "ldr q4, [%[a_ptr], #0x140]\n" - "fmla v23.4s, v5.4s, v0.4s\n" - "ldr q5, [%[a_ptr], #0x150]\n" - "fmla v24.4s, v6.4s, v0.4s\n" - "ldr q6, [%[a_ptr], #0x160]\n" - "fmla v25.4s, v7.4s, v0.4s\n" - "ldr q7, [%[a_ptr], #0x170]\n" - ASM_PREFETCH("[%[pf_ptr], #0x100]") - "add %[a_ptr], %[a_ptr], %[jump]\n" - "fmla v26.4s, v2.4s, v0.4s\n" - "ldr q2, [%[a_ptr], #0x00]\n" - "fmla v27.4s, v3.4s, v0.4s\n" - "ldr q3, [%[a_ptr], #0x10]\n" - "fmla v28.4s, v4.4s, v0.4s\n" - "ldr q4, [%[a_ptr], #0x20]\n" - "fmla v29.4s, v5.4s, v0.4s\n" - "ldr q5, [%[a_ptr], #0x30]\n" - ASM_PREFETCH("[%[pf_ptr], #0x140]") - "fmla v30.4s, v6.4s, v0.4s\n" - "add %[pf_ptr], %[pf_ptr], %[jump]\n" - "ldr q6, [%[a_ptr], #0x40]\n" - "fmla v31.4s, v7.4s, v0.4s\n" - "cmp %[pf_ptr], %[pf_limit]\n" - "ldr q7, [%[a_ptr], #0x50]\n" - "blt 8b\n" - - // Check that there is still work to do. - "9:\n" - "cmp %w[k], #0\n" - "beq 10f\n" - - // Loop without prefetches, exit when k hits 0. 
- "2:\n" - "dup v0.4s, w0\n" - "ldr w0, [%[x_ptr], #4]\n" - "add %[x_ptr], %[x_ptr], #0x4\n" - "fmla v8.4s, v2.4s, v0.4s\n" - "ldr q2, [%[a_ptr], #0x60]\n" - "fmla v9.4s, v3.4s, v0.4s\n" - "ldr q3, [%[a_ptr], #0x70]\n" - "fmla v10.4s, v4.4s, v0.4s\n" - "ldr q4, [%[a_ptr], #0x80]\n" - "fmla v11.4s, v5.4s, v0.4s\n" - "ldr q5, [%[a_ptr], #0x90]\n" - "subs %w[k], %w[k], #1\n" - "fmla v12.4s, v6.4s, v0.4s\n" - "ldr q6, [%[a_ptr], #0xa0]\n" - "fmla v13.4s, v7.4s, v0.4s\n" - "ldr q7, [%[a_ptr], #0xb0]\n" - "fmla v14.4s, v2.4s, v0.4s\n" - "ldr q2, [%[a_ptr], #0xc0]\n" - "fmla v15.4s, v3.4s, v0.4s\n" - "ldr q3, [%[a_ptr], #0xd0]\n" - "fmla v16.4s, v4.4s, v0.4s\n" - "ldr q4, [%[a_ptr], #0xe0]\n" - "fmla v17.4s, v5.4s, v0.4s\n" - "ldr q5, [%[a_ptr], #0xf0]\n" - "fmla v18.4s, v6.4s, v0.4s\n" - "ldr q6, [%[a_ptr], #0x100]\n" - "fmla v19.4s, v7.4s, v0.4s\n" - "ldr q7, [%[a_ptr], #0x110]\n" - "fmla v20.4s, v2.4s, v0.4s\n" - "ldr q2, [%[a_ptr], #0x120]\n" - "fmla v21.4s, v3.4s, v0.4s\n" - "ldr q3, [%[a_ptr], #0x130]\n" - "fmla v22.4s, v4.4s, v0.4s\n" - "ldr q4, [%[a_ptr], #0x140]\n" - "fmla v23.4s, v5.4s, v0.4s\n" - "ldr q5, [%[a_ptr], #0x150]\n" - "fmla v24.4s, v6.4s, v0.4s\n" - "ldr q6, [%[a_ptr], #0x160]\n" - "fmla v25.4s, v7.4s, v0.4s\n" - "ldr q7, [%[a_ptr], #0x170]\n" - "add %[a_ptr], %[a_ptr], %[jump]\n" - "fmla v26.4s, v2.4s, v0.4s\n" - "ldr q2, [%[a_ptr], #0x00]\n" - "fmla v27.4s, v3.4s, v0.4s\n" - "ldr q3, [%[a_ptr], #0x10]\n" - "fmla v28.4s, v4.4s, v0.4s\n" - "ldr q4, [%[a_ptr], #0x20]\n" - "fmla v29.4s, v5.4s, v0.4s\n" - "ldr q5, [%[a_ptr], #0x30]\n" - "fmla v30.4s, v6.4s, v0.4s\n" - "ldr q6, [%[a_ptr], #0x40]\n" - "fmla v31.4s, v7.4s, v0.4s\n" - "ldr q7, [%[a_ptr], #0x50]\n" - "bne 2b\n" - - "10:\n" - - // Final iteration - "dup v0.4s, w0\n" - "fmla v8.4s, v2.4s, v0.4s\n" - "ldr q2, [%[a_ptr], #0x60]\n" - "fmla v9.4s, v3.4s, v0.4s\n" - "ldr q3, [%[a_ptr], #0x70]\n" - "fmla v10.4s, v4.4s, v0.4s\n" - "ldr q4, [%[a_ptr], #0x80]\n" - "fmla v11.4s, v5.4s, v0.4s\n" - "ldr q5, [%[a_ptr], #0x90]\n" - "fmla v12.4s, v6.4s, v0.4s\n" - "ldr q6, [%[a_ptr], #0xa0]\n" - "fmla v13.4s, v7.4s, v0.4s\n" - "ldr q7, [%[a_ptr], #0xb0]\n" - "fmla v14.4s, v2.4s, v0.4s\n" - "ldr q2, [%[a_ptr], #0xc0]\n" - "fmla v15.4s, v3.4s, v0.4s\n" - "ldr q3, [%[a_ptr], #0xd0]\n" - "fmla v16.4s, v4.4s, v0.4s\n" - "ldr q4, [%[a_ptr], #0xe0]\n" - "fmla v17.4s, v5.4s, v0.4s\n" - "ldr q5, [%[a_ptr], #0xf0]\n" - "fmla v18.4s, v6.4s, v0.4s\n" - "ldr q6, [%[a_ptr], #0x100]\n" - "fmla v19.4s, v7.4s, v0.4s\n" - "ldr q7, [%[a_ptr], #0x110]\n" - "fmla v20.4s, v2.4s, v0.4s\n" - "ldr q2, [%[a_ptr], #0x120]\n" - "fmla v21.4s, v3.4s, v0.4s\n" - "ldr q3, [%[a_ptr], #0x130]\n" - "fmla v22.4s, v4.4s, v0.4s\n" - "ldr q4, [%[a_ptr], #0x140]\n" - "fmla v23.4s, v5.4s, v0.4s\n" - "ldr q5, [%[a_ptr], #0x150]\n" - "fmla v24.4s, v6.4s, v0.4s\n" - "ldr q6, [%[a_ptr], #0x160]\n" - "fmla v25.4s, v7.4s, v0.4s\n" - "ldr q7, [%[a_ptr], #0x170]\n" - "fmla v26.4s, v2.4s, v0.4s\n" - "ldr q2, [%[y_ptr]]\n" - "fmla v27.4s, v3.4s, v0.4s\n" - "ldr q3, [%[y_ptr], #0x10]\n" - "fmla v28.4s, v4.4s, v0.4s\n" - "ldr q4, [%[y_ptr], #0x20]\n" - "fmla v29.4s, v5.4s, v0.4s\n" - "ldr q5, [%[y_ptr], #0x30]\n" - "fmla v30.4s, v6.4s, v0.4s\n" - "ldr q6, [%[y_ptr], #0x40]\n" - "fmla v31.4s, v7.4s, v0.4s\n" - "ldr q7, [%[y_ptr], #0x50]\n" - - "fmla v2.4s, v8.4s, %[va].4s\n" - "ldr q8, [%[y_ptr], #0x60]\n" - "fmla v3.4s, v9.4s, %[va].4s\n" - "ldr q9, [%[y_ptr], #0x70]\n" - "fmla v4.4s, v10.4s, %[va].4s\n" - "ldr q10, [%[y_ptr], #0x80]\n" - "fmla v5.4s, v11.4s, %[va].4s\n" - "ldr q11, 
[%[y_ptr], #0x90]\n" - "fmla v6.4s, v12.4s, %[va].4s\n" - "ldr q12, [%[y_ptr], #0xa0]\n" - "str q2, [%[y_ptr], #0x00]\n" - "fmla v7.4s, v13.4s, %[va].4s\n" - "ldr q13, [%[y_ptr], #0xb0]\n" - "str q3, [%[y_ptr], #0x10]\n" - "fmla v8.4s, v14.4s, %[va].4s\n" - "ldr q14, [%[y_ptr], #0xc0]\n" - "str q4, [%[y_ptr], #0x20]\n" - "fmla v9.4s, v15.4s, %[va].4s\n" - "ldr q15, [%[y_ptr], #0xd0]\n" - "str q5, [%[y_ptr], #0x30]\n" - "fmla v10.4s, v16.4s, %[va].4s\n" - "ldr q16, [%[y_ptr], #0xe0]\n" - "str q6, [%[y_ptr], #0x40]\n" - "fmla v11.4s, v17.4s, %[va].4s\n" - "ldr q17, [%[y_ptr], #0xf0]\n" - "str q7, [%[y_ptr], #0x50]\n" - "fmla v12.4s, v18.4s, %[va].4s\n" - "ldr q18, [%[y_ptr], #0x100]\n" - "str q8, [%[y_ptr], #0x60]\n" - "fmla v13.4s, v19.4s, %[va].4s\n" - "ldr q19, [%[y_ptr], #0x110]\n" - "str q9, [%[y_ptr], #0x70]\n" - "fmla v14.4s, v20.4s, %[va].4s\n" - "ldr q20, [%[y_ptr], #0x120]\n" - "str q10, [%[y_ptr], #0x80]\n" - "fmla v15.4s, v21.4s, %[va].4s\n" - "ldr q21, [%[y_ptr], #0x130]\n" - "str q11, [%[y_ptr], #0x90]\n" - "fmla v16.4s, v22.4s, %[va].4s\n" - "ldr q22, [%[y_ptr], #0x140]\n" - "str q12, [%[y_ptr], #0xa0]\n" - "fmla v17.4s, v23.4s, %[va].4s\n" - "ldr q23, [%[y_ptr], #0x150]\n" - "str q13, [%[y_ptr], #0xb0]\n" - "fmla v18.4s, v24.4s, %[va].4s\n" - "ldr q24, [%[y_ptr], #0x160]\n" - "str q14, [%[y_ptr], #0xc0]\n" - "fmla v19.4s, v25.4s, %[va].4s\n" - "ldr q25, [%[y_ptr], #0x170]\n" - "str q15, [%[y_ptr], #0xd0]\n" - "fmla v20.4s, v26.4s, %[va].4s\n" - "str q16, [%[y_ptr], #0xe0]\n" - "fmla v21.4s, v27.4s, %[va].4s\n" - "str q17, [%[y_ptr], #0xf0]\n" - "fmla v22.4s, v28.4s, %[va].4s\n" - "str q18, [%[y_ptr], #0x100]\n" - "fmla v23.4s, v29.4s, %[va].4s\n" - "str q19, [%[y_ptr], #0x110]\n" - "fmla v24.4s, v30.4s, %[va].4s\n" - "str q20, [%[y_ptr], #0x120]\n" - "fmla v25.4s, v31.4s, %[va].4s\n" - "str q21, [%[y_ptr], #0x130]\n" - - "stp q22, q23, [%[y_ptr], #0x140]\n" - "stp q24, q25, [%[y_ptr], #0x160]\n" - "add %[y_ptr], %[y_ptr], #0x180\n" - - : [a_ptr] "+r" (a_ptr), [x_ptr] "+r" (x_ptr), [y_ptr] "+r" (y_ptr), [k] "+r" (k), [pf_ptr] "+r" (pf_ptr), [firstpf_ptr] "+r" (firstpf_ptr) - : [jump] "r" (jump), [va] "w" (va), [pf_limit] "r" (pf_limit) - : "w0", "v0", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", - "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31", "cc" - ); - } - - if (N>0) { - // Handle N tail - up to 95 stragglers. - // This is 0-23 vectors, plus optionally an 64-bit vector and/or a - // single value for the remainder. - - // Independent pointers into the matrix for the odd 2 and odd 1. - // Double up as flag to indicate whether they are needed. - const float *odd2_aptr=NULL; - const float *odd1_aptr=NULL; - - // Figure out how much work we need to do. - int numvecs = N/4; - int rem = N%4; - int k=M; - - // Set up pointers for the odd 2/1 if needed. - if (rem >= 2) { - odd2_aptr = a_ptr_base + (numvecs * 4); - } - - if (rem & 1) { - odd1_aptr = a_ptr_base + (numvecs * 4) + (odd2_aptr==NULL ? 0 : 2); - } - - const float *a_ptr = a_ptr_base; - const float *firstpf_ptr = a_ptr_base; - const float *pf_ptr = a_ptr_base; - const float *pf_limit = a_ptr + (M * lda); - - const float *x_ptr = Xstart; - int vecs=0; // Working variable to count how many vectors to work on. - int dopf=1; // Track whether we are doing prefetches. - - // Figure out how many cache lines we need to prefetch each time. 
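One detail worth spelling out before the code below: the tail path prefetches by whole cache lines, and a column of N residual floats occupies 4*N bytes, i.e. ceil(N/16) 64-byte lines, which is what the (N + 15) / 16 expression computes. As a sketch (the 64-byte line size is an assumption carried over from the prefetch offsets used above):

// Cache lines touched per column of the N tail, for 4-byte floats and
// 64-byte lines; e.g. lines_per_column(50) == (50 + 15) / 16 == 4.
static inline int lines_per_column(int n_floats) {
    return (n_floats + 15) / 16;
}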
- int numpfs = (N + 15) / 16; - - // Do initial prefetches - for (int i=0; i<firstpfd+1; i++) { - prefetch_1x(firstpf_ptr); - firstpf_ptr += lda; - } - - // Do "main" prefetches - adapt number to the number we actually need. - if (numpfs > 1) { - for (int i=0; i<pfd+1; i++) { - switch (numpfs) { - case 2: - prefetch_1x(pf_ptr + 16); - break; - - case 3: - prefetch_2x(pf_ptr + 16); - break; - - case 4: - prefetch_3x(pf_ptr + 16); - break; - - case 5: - prefetch_4x(pf_ptr + 16); - break; - - case 6: - prefetch_5x(pf_ptr + 16); - break; - } - pf_ptr += lda; - } - } else { - // Just disable additional prefetches - dopf=0; - } - - // Do the real work - __asm __volatile ( - // Initialize all the vectors - not worth skipping this if only - // some are needed. - "movi v8.4s,#0x0\n" - "ldr w0, [%[x_ptr]]\n" - "movi v9.4s,#0x0\n" - "movi v10.4s,#0x0\n" - "movi v11.4s,#0x0\n" - "movi v12.4s,#0x0\n" - "movi v13.4s,#0x0\n" - "movi v14.4s,#0x0\n" - "movi v15.4s,#0x0\n" - "movi v16.4s, #0x0\n" - "movi v17.4s, #0x0\n" - "movi v18.4s, #0x0\n" - "movi v19.4s, #0x0\n" - "movi v20.4s, #0x0\n" - "movi v21.4s, #0x0\n" - "movi v22.4s, #0x0\n" - "movi v23.4s, #0x0\n" - "movi v24.4s, #0x0\n" - "movi v25.4s, #0x0\n" - "movi v26.4s, #0x0\n" - "movi v27.4s, #0x0\n" - "movi v28.4s, #0x0\n" - "movi v29.4s, #0x0\n" - "movi v30.4s, #0x0\n" - "movi v6.2s, #0x0\n" - "movi v5.2s, #0x0\n" - - "1:\n" - ASM_PREFETCH("[%[firstpf_ptr]]\n") - "11:\n" - "dup v0.4s, w0\n" - "ldr w0, [%[x_ptr], #4]\n" - "add %[x_ptr], %[x_ptr], #4\n" - - "cbz %w[numvecs], 2f\n" - "mov %w[vecs], %w[numvecs]\n" - - // Vector 0 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7,[%[a_ptr], #0x00]\n" - "fmla v8.4s, v7.4s, v0.4s\n" - "beq 2f\n" - // Vector 1 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7,[%[a_ptr], #0x10]\n" - "fmla v9.4s, v7.4s, v0.4s\n" - "beq 2f\n" - // Vector 2 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7,[%[a_ptr], #0x20]\n" - "fmla v10.4s, v7.4s, v0.4s\n" - "beq 2f\n" - // Vector 3 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7,[%[a_ptr], #0x30]\n" - "fmla v11.4s, v7.4s, v0.4s\n" - // Prefetch - "cbz %w[dopf], 3f\n" - ASM_PREFETCH("[%[pf_ptr], #0x40]") - "3:\n" - "beq 2f\n" - - // Vector 4 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7,[%[a_ptr], #0x40]\n" - "fmla v12.4s, v7.4s, v0.4s\n" - "beq 2f\n" - // Vector 5 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7,[%[a_ptr], #0x50]\n" - "fmla v13.4s, v7.4s, v0.4s\n" - "beq 2f\n" - // Vector 6 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7,[%[a_ptr], #0x60]\n" - "fmla v14.4s, v7.4s, v0.4s\n" - "beq 2f\n" - // Vector 7 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7,[%[a_ptr], #0x70]\n" - "fmla v15.4s, v7.4s, v0.4s\n" - // Prefetch - "cbz %w[dopf], 4f\n" - ASM_PREFETCH("[%[pf_ptr], #0x80]") - "4:\n" - "beq 2f\n" - - // Vector 8 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7,[%[a_ptr], #0x80]\n" - "fmla v16.4s, v7.4s, v0.4s\n" - "beq 2f\n" - // Vector 9 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7,[%[a_ptr], #0x90]\n" - "fmla v17.4s, v7.4s, v0.4s\n" - "beq 2f\n" - // Vector 10 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7,[%[a_ptr], #0xa0]\n" - "fmla v18.4s, v7.4s, v0.4s\n" - "beq 2f\n" - // Vector 11 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7,[%[a_ptr], #0xb0]\n" - "fmla v19.4s, v7.4s, v0.4s\n" - // Prefetch - "cbz %w[dopf], 5f\n" - ASM_PREFETCH("[%[pf_ptr], #0xc0]") - "5:\n" - "beq 2f\n" - - // Vector 12 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7,[%[a_ptr], #0xc0]\n" - "fmla v20.4s, v7.4s, v0.4s\n" - "beq 2f\n" - // Vector 13 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7,[%[a_ptr], #0xd0]\n" - "fmla v21.4s, v7.4s, 
v0.4s\n" - "beq 2f\n" - // Vector 14 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7,[%[a_ptr], #0xe0]\n" - "fmla v22.4s, v7.4s, v0.4s\n" - "beq 2f\n" - // Vector 15 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7,[%[a_ptr], #0xf0]\n" - "fmla v23.4s, v7.4s, v0.4s\n" - // Prefetch - "cbz %w[dopf], 6f\n" - ASM_PREFETCH("[%[pf_ptr], #0x100]") - "6:\n" - "beq 2f\n" - - // Vector 16 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7,[%[a_ptr], #0x100]\n" - "fmla v24.4s, v7.4s, v0.4s\n" - "beq 2f\n" - // Vector 17 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7,[%[a_ptr], #0x110]\n" - "fmla v25.4s, v7.4s, v0.4s\n" - "beq 2f\n" - // Vector 18 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7,[%[a_ptr], #0x120]\n" - "fmla v26.4s, v7.4s, v0.4s\n" - "beq 2f\n" - // Vector 19 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7,[%[a_ptr], #0x130]\n" - "fmla v27.4s, v7.4s, v0.4s\n" - // Prefetch - "cbz %w[dopf], 7f\n" - ASM_PREFETCH("[%[pf_ptr], #0x140]") - "7:\n" - "beq 2f\n" - - // Vector 20 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7,[%[a_ptr], #0x140]\n" - "fmla v28.4s, v7.4s, v0.4s\n" - "beq 2f\n" - // Vector 21 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7,[%[a_ptr], #0x150]\n" - "fmla v29.4s, v7.4s, v0.4s\n" - "beq 2f\n" - // Vector 22 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7,[%[a_ptr], #0x160]\n" - "fmla v30.4s, v7.4s, v0.4s\n" - - "2:\n" - "add %[a_ptr], %[a_ptr], %[jump]\n" - - // Do the odd 2-vector, if needed - "cbz %[odd2_aptr], 8f\n" - "ldr d7, [%[odd2_aptr]]\n" - "fmla v6.2s, v7.2s, v0.2s\n" - "add %[odd2_aptr], %[odd2_aptr], %[jump]\n" - - "8:\n" - // Do the odd 1-vector, if needed - "cbz %[odd1_aptr], 9f\n" - "ldr s7, [%[odd1_aptr]]\n" - "fmla v5.2s, v7.2s, v0.2s\n" - "add %[odd1_aptr], %[odd1_aptr], %[jump]\n" - - // Get out if needed. - "9:\n" - "subs %w[k], %w[k], #1\n" - "beq 10f\n" - - // Update the "main" prefetch pointer, if it strays beyond the limit turn off "dopf" - "add %[pf_ptr], %[pf_ptr], %[jump]\n" - "cmp %[pf_ptr], %[pf_limit]\n" - "csel %w[dopf], %w[dopf], WZR, LT\n" - - // Update the "leading" prefetch pointer, don't do the first - // instruction of the loop if it's over the limit. 
- "add %[firstpf_ptr], %[firstpf_ptr], %[jump]\n" - "cmp %[firstpf_ptr], %[pf_limit]\n" - "blt 1b\n" - "b 11b\n" - - // Now write out the outputs - "10:\n" - "cbz %w[numvecs], 12f\n" - "mov %w[vecs], %w[numvecs]\n" - - // Vector 0 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7, [%[y_ptr]]\n" - "fmla v7.4s, v8.4s, %[va].4s\n" - "str q7, [%[y_ptr]], #0x10\n" - "beq 12f\n" - // Vector 1 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7, [%[y_ptr]]\n" - "fmla v7.4s, v9.4s, %[va].4s\n" - "str q7, [%[y_ptr]], #0x10\n" - "beq 12f\n" - // Vector 2 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7, [%[y_ptr]]\n" - "fmla v7.4s, v10.4s, %[va].4s\n" - "str q7, [%[y_ptr]], #0x10\n" - "beq 12f\n" - // Vector 3 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7, [%[y_ptr]]\n" - "fmla v7.4s, v11.4s, %[va].4s\n" - "str q7, [%[y_ptr]], #0x10\n" - "beq 12f\n" - // Vector 4 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7, [%[y_ptr]]\n" - "fmla v7.4s, v12.4s, %[va].4s\n" - "str q7, [%[y_ptr]], #0x10\n" - "beq 12f\n" - // Vector 5 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7, [%[y_ptr]]\n" - "fmla v7.4s, v13.4s, %[va].4s\n" - "str q7, [%[y_ptr]], #0x10\n" - "beq 12f\n" - // Vector 6 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7, [%[y_ptr]]\n" - "fmla v7.4s, v14.4s, %[va].4s\n" - "str q7, [%[y_ptr]], #0x10\n" - "beq 12f\n" - // Vector 7 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7, [%[y_ptr]]\n" - "fmla v7.4s, v15.4s, %[va].4s\n" - "str q7, [%[y_ptr]], #0x10\n" - "beq 12f\n" - // Vector 8 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7, [%[y_ptr]]\n" - "fmla v7.4s, v16.4s, %[va].4s\n" - "str q7, [%[y_ptr]], #0x10\n" - "beq 12f\n" - // Vector 9 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7, [%[y_ptr]]\n" - "fmla v7.4s, v17.4s, %[va].4s\n" - "str q7, [%[y_ptr]], #0x10\n" - "beq 12f\n" - // Vector 10 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7, [%[y_ptr]]\n" - "fmla v7.4s, v18.4s, %[va].4s\n" - "str q7, [%[y_ptr]], #0x10\n" - "beq 12f\n" - // Vector 11 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7, [%[y_ptr]]\n" - "fmla v7.4s, v19.4s, %[va].4s\n" - "str q7, [%[y_ptr]], #0x10\n" - "beq 12f\n" - // Vector 12 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7, [%[y_ptr]]\n" - "fmla v7.4s, v20.4s, %[va].4s\n" - "str q7, [%[y_ptr]], #0x10\n" - "beq 12f\n" - // Vector 13 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7, [%[y_ptr]]\n" - "fmla v7.4s, v21.4s, %[va].4s\n" - "str q7, [%[y_ptr]], #0x10\n" - "beq 12f\n" - // Vector 14 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7, [%[y_ptr]]\n" - "fmla v7.4s, v22.4s, %[va].4s\n" - "str q7, [%[y_ptr]], #0x10\n" - "beq 12f\n" - // Vector 15 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7, [%[y_ptr]]\n" - "fmla v7.4s, v23.4s, %[va].4s\n" - "str q7, [%[y_ptr]], #0x10\n" - "beq 12f\n" - // Vector 16 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7, [%[y_ptr]]\n" - "fmla v7.4s, v24.4s, %[va].4s\n" - "str q7, [%[y_ptr]], #0x10\n" - "beq 12f\n" - // Vector 17 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7, [%[y_ptr]]\n" - "fmla v7.4s, v25.4s, %[va].4s\n" - "str q7, [%[y_ptr]], #0x10\n" - "beq 12f\n" - // Vector 18 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7, [%[y_ptr]]\n" - "fmla v7.4s, v26.4s, %[va].4s\n" - "str q7, [%[y_ptr]], #0x10\n" - "beq 12f\n" - // Vector 19 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7, [%[y_ptr]]\n" - "fmla v7.4s, v27.4s, %[va].4s\n" - "str q7, [%[y_ptr]], #0x10\n" - "beq 12f\n" - // Vector 20 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7, [%[y_ptr]]\n" - "fmla v7.4s, v28.4s, %[va].4s\n" - "str q7, [%[y_ptr]], #0x10\n" - "beq 12f\n" - // Vector 21 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7, [%[y_ptr]]\n" - "fmla v7.4s, 
v29.4s, %[va].4s\n" - "str q7, [%[y_ptr]], #0x10\n" - "beq 12f\n" - // Vector 22 - "subs %w[vecs], %w[vecs], #1\n" - "ldr q7, [%[y_ptr]]\n" - "fmla v7.4s, v30.4s, %[va].4s\n" - "str q7, [%[y_ptr]], #0x10\n" - - // Odd 2 - "12:\n" - "cbz %[odd2_aptr], 13f\n" - "ldr d7, [%[y_ptr]]\n" - "fmla v7.2s, v6.2s, %[va].2s\n" - "str d7, [%[y_ptr]], #0x8\n" - - // Odd 1 - "13:\n" - "cbz %[odd1_aptr], 14f\n" - "ldr s7, [%[y_ptr]]\n" - "fmla v7.2s, v5.2s, %[va].2s\n" - "str s7, [%[y_ptr]]\n" - - "14:\n" - : [a_ptr] "+r" (a_ptr), [x_ptr] "+r" (x_ptr), [y_ptr] "+r" (y_ptr), [k] "+r" (k), - [pf_ptr] "+r" (pf_ptr), [firstpf_ptr] "+r" (firstpf_ptr), - [odd1_aptr] "+r" (odd1_aptr), [odd2_aptr] "+r" (odd2_aptr), - [dopf] "+r" (dopf), [vecs] "+r" (vecs) - : [jump] "r" (jump), [va] "w" (va), [pf_limit] "r" (pf_limit), [numvecs] "r" (numvecs) - : "w0", "v0", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", - "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31", "cc" - ); - } -} diff --git a/arm_compute/core/NEON/kernels/assembly/mergeresults.hpp b/arm_compute/core/NEON/kernels/assembly/mergeresults.hpp deleted file mode 100644 index 6731480fca..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/mergeresults.hpp +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -template<unsigned int width, unsigned int height, typename Tin, typename Tout> -void MergeResults(Tout *out, const Tin *in, int ldc, int y0, int ymax, int x0, int xmax, const Tout alpha, const Tout beta) { - int full_y_blocks = (ymax - y0) / height; - int y_remainder = (ymax - y0) % height; - int y_blocks = full_y_blocks + (y_remainder ? 1 : 0); - - int full_x_blocks = (xmax - x0) / width; - int x_remainder = (xmax - x0) % width; - int x_blocks = full_x_blocks + (x_remainder ? 1 : 0); - - for (int y_block = 0; y_block < y_blocks; y_block++) { - int ybase = y0 + (y_block * height); - - int fill_rows = (y_block < full_y_blocks) ? height : y_remainder; - - for (int x_block = 0; x_block < x_blocks; x_block++) { - int xbase = x0 + (x_block * width); - - int fill_cols = (x_block < full_x_blocks) ? 
width : x_remainder; - - for (int row=0; row < fill_rows; row++) { - for (int col=0; col < fill_cols; col++) { - Tout &p = out[(ybase + row) * ldc + xbase + col]; - - p = (p * alpha) + (beta * in[row * width + col]); - } - } - - in += (width * height); - } - } -} - -#include "merges/list.hpp" diff --git a/arm_compute/core/NEON/kernels/assembly/merges/a32_merge_float_8x6.hpp b/arm_compute/core/NEON/kernels/assembly/merges/a32_merge_float_8x6.hpp deleted file mode 100644 index ddd67e8ee2..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/merges/a32_merge_float_8x6.hpp +++ /dev/null @@ -1,170 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __arm__ - -#include "../asmlib.hpp" - -#include <arm_neon.h> - -template<> -inline void MergeResults<8, 6>(float *out, const float *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const float alpha, const float beta) { - const float *inptr = in; -// prefetch_6x(inptr); -// prefetch_6x(inptr + 96); - - float32x4_t av = vdupq_n_f32(alpha); - float32x4_t bv = vdupq_n_f32(beta); - - for (int y=y0; y<ymax; y+=8) { - float *outptr0 = out + (y * ldout) + x0; - float *outptr1 = outptr0 + ldout; - float *outptr2 = outptr1 + ldout; - float *outptr3 = outptr2 + ldout; - float *outptr4 = outptr3 + ldout; - float *outptr5 = outptr4 + ldout; - -// prefetch_2x(outptr0); -// prefetch_2x(outptr1); -// prefetch_2x(outptr2); -// prefetch_2x(outptr3); -// prefetch_2x(outptr4); -// prefetch_2x(outptr5); - - for (int i=x0; i<xmax; i+=8) { - float dummyres[8]; - - /* Make sure we throw away results if Y isn't a multiple of 8. - * We do this by pointing the result pointer at a dummy buffer - * we later discard. */ - if ((y+5) >= ymax) { - switch ((y + 5) - ymax) { - case 4: - outptr1 = dummyres; - case 3: - outptr2 = dummyres; - case 2: - outptr3 = dummyres; - case 1: - outptr4 = dummyres; - case 0: - outptr5 = dummyres; - default: - break; - } - } - - /* For ragged X, manually copy over the valid results. 
*/ - if ((i+7) >= xmax) { - for (int xi=0; xi<8; xi++) { - if ((i+xi) < xmax) { - *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta); - outptr0++; - *outptr1 = (alpha * inptr[xi + 8]) + (*outptr1 * beta); - outptr1++; - *outptr2 = (alpha * inptr[xi + 16]) + (*outptr2 * beta); - outptr2++; - *outptr3 = (alpha * inptr[xi + 24]) + (*outptr3 * beta); - outptr3++; - *outptr4 = (alpha * inptr[xi + 32]) + (*outptr4 * beta); - outptr4++; - *outptr5 = (alpha * inptr[xi + 40]) + (*outptr5 * beta); - outptr5++; - } - } - inptr += 48; - } else { - /* Optimized routine to copy an entire block */ - __asm __volatile ( - // Rows 0-1 - "VLD1.32 {d8-d11}, [%[outptr0]]\n" - "VMUL.f32 q4, q4, %q[bv]\n" - "VLD1.32 {d12-d15}, [%[outptr1]]\n" - "VMUL.f32 q5, q5, %q[bv]\n" - "VLD1.32 {d0-d3}, [%[inptr]]!\n" - "VMUL.f32 q6, q6, %q[bv]\n" - "VLD1.32 {d4-d7}, [%[inptr]]!\n" - "VMUL.f32 q7, q7, %q[bv]\n" - - "VMLA.f32 q4, q0, %q[av]\n" - ASM_PREFETCH("[%[inptr], #352]") - "VMLA.f32 q5, q1, %q[av]\n" - "VST1.32 {d8-d11}, [%[outptr0]]!\n" - ASM_PREFETCH("[%[inptr], #416]") - "VMLA.f32 q6, q2, %q[av]\n" - ASM_PREFETCH("[%[inptr], #480]") - "VMLA.f32 q7, q3, %q[av]\n" - "VST1.32 {d12-d15}, [%[outptr1]]!\n" - - // Rows 2-3 - "VLD1.32 {d8-d11}, [%[outptr2]]\n" - "VMUL.f32 q4, q4, %q[bv]\n" - "VLD1.32 {d12-d15}, [%[outptr3]]\n" - "VMUL.f32 q5, q5, %q[bv]\n" - "VLD1.32 {d0-d3}, [%[inptr]]!\n" - "VMUL.f32 q6, q6, %q[bv]\n" - "VLD1.32 {d4-d7}, [%[inptr]]!\n" - "VMUL.f32 q7, q7, %q[bv]\n" - - "VMLA.f32 q4, q0, %q[av]\n" - ASM_PREFETCH("[%[outptr0], #96]") - "VMLA.f32 q5, q1, %q[av]\n" - "VST1.32 {d8-d11}, [%[outptr2]]!\n" - ASM_PREFETCH("[%[outptr1], #96]") - "VMLA.f32 q6, q2, %q[av]\n" - ASM_PREFETCH("[%[outptr2], #96]") - "VMLA.f32 q7, q3, %q[av]\n" - "VST1.32 {d12-d15}, [%[outptr3]]!\n" - - // Rows 4-5 - "VLD1.32 {d8-d11}, [%[outptr4]]\n" - "VMUL.f32 q4, q4, %q[bv]\n" - "VLD1.32 {d12-d15}, [%[outptr5]]\n" - "VMUL.f32 q5, q5, %q[bv]\n" - "VLD1.32 {d0-d3}, [%[inptr]]!\n" - "VMUL.f32 q6, q6, %q[bv]\n" - "VLD1.32 {d4-d7}, [%[inptr]]!\n" - "VMUL.f32 q7, q7, %q[bv]\n" - - "VMLA.f32 q4, q0, %q[av]\n" - ASM_PREFETCH("[%[outptr3], #96]") - "VMLA.f32 q5, q1, %q[av]\n" - "VST1.32 {d8-d11}, [%[outptr4]]!\n" - ASM_PREFETCH("[%[outptr4], #96]") - "VMLA.f32 q6, q2, %q[av]\n" - ASM_PREFETCH("[%[outptr5], #128]") - "VMLA.f32 q7, q3, %q[av]\n" - "VST1.32 {d12-d15}, [%[outptr5]]!\n" - : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), - [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [inptr] "+r" (inptr) - : [av] "w" (av), [bv] "w" (bv) - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7" - ); - } - } - } -} - -#endif // __arm__ diff --git a/arm_compute/core/NEON/kernels/assembly/merges/a64_merge_float_12x8.hpp b/arm_compute/core/NEON/kernels/assembly/merges/a64_merge_float_12x8.hpp deleted file mode 100644 index e8edddb4f4..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/merges/a64_merge_float_12x8.hpp +++ /dev/null @@ -1,236 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __aarch64__ - -#include "../asmlib.hpp" - -template<> -inline void MergeResults<12, 8>(float *out, const float *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const float alpha, const float beta) { - const float *inptr = in; - prefetch_6x(inptr); - prefetch_6x(inptr + 96); - - float32x4_t av = vdupq_n_f32(alpha); - float32x4_t bv = vdupq_n_f32(beta); - - for (int y=y0; y<ymax; y+=8) { - float *outptr0 = out + (y * ldout) + x0; - float *outptr1 = outptr0 + ldout; - float *outptr2 = outptr1 + ldout; - float *outptr3 = outptr2 + ldout; - float *outptr4 = outptr3 + ldout; - float *outptr5 = outptr4 + ldout; - float *outptr6 = outptr5 + ldout; - float *outptr7 = outptr6 + ldout; - - prefetch_2x(outptr0); - prefetch_2x(outptr1); - prefetch_2x(outptr2); - prefetch_2x(outptr3); - prefetch_2x(outptr4); - prefetch_2x(outptr5); - prefetch_2x(outptr6); - prefetch_2x(outptr7); - - for (int i=x0; i<xmax; i+=12) { - float dummyres[12]; - - /* Make sure we throw away results if Y isn't a multiple of 8. - * We do this by pointing the result pointer at a dummy buffer - * we later discard. */ - if ((y+7) >= ymax) { - switch ((y + 7) - ymax) { - case 6: - outptr1 = dummyres; - case 5: - outptr2 = dummyres; - case 4: - outptr3 = dummyres; - case 3: - outptr4 = dummyres; - case 2: - outptr5 = dummyres; - case 1: - outptr6 = dummyres; - case 0: - outptr7 = dummyres; - default: - break; - } - } - - /* For ragged X, manually copy over the valid results. 
*/ - if ((i+11) >= xmax) { - for (int xi=0; xi<12; xi++) { - if ((i+xi) < xmax) { - *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta); - outptr0++; - *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta); - outptr1++; - *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta); - outptr2++; - *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta); - outptr3++; - *outptr4 = (alpha * inptr[xi + 48]) + (*outptr4 * beta); - outptr4++; - *outptr5 = (alpha * inptr[xi + 60]) + (*outptr5 * beta); - outptr5++; - *outptr6 = (alpha * inptr[xi + 72]) + (*outptr6 * beta); - outptr6++; - *outptr7 = (alpha * inptr[xi + 84]) + (*outptr7 * beta); - outptr7++; - } - } - inptr += 96; - } else { - /* Optimized routine to copy an entire block */ - __asm __volatile ( - // Rows 0-1 - "LDP q16, q17, [%[outptr0]]\n" - "FMUL v16.4s, v16.4s, %[bv].4s\n" - "LDR q18, [%[outptr0], #32]\n" - "FMUL v17.4s, v17.4s, %[bv].4s\n" - "LDP q19, q20, [%[outptr1]]\n" - "FMUL v18.4s, v18.4s, %[bv].4s\n" - "LDR q21, [%[outptr1], #32]\n" - ASM_PREFETCH("[%[inptr], #768]") - "FMUL v19.4s, v19.4s, %[bv].4s\n" - "LDP q0, q1, [%[inptr]]\n" - "FMUL v20.4s, v20.4s, %[bv].4s\n" - "LDP q2, q3, [%[inptr], #32]\n" - "FMUL v21.4s, v21.4s, %[bv].4s\n" - "LDP q4, q5, [%[inptr], #64]\n" - "FMLA v16.4s, v0.4s, %[av].4s\n" - ASM_PREFETCH("[%[inptr], #832]") - "FMLA v17.4s, v1.4s, %[av].4s\n" - "STP q16, q17, [%[outptr0]], #32\n" - "FMLA v18.4s, v2.4s, %[av].4s\n" - "STR q18, [%[outptr0]], #16\n" - "FMLA v19.4s, v3.4s, %[av].4s\n" - ASM_PREFETCH("[%[inptr], #896]") - "FMLA v20.4s, v4.4s, %[av].4s\n" - "STP q19, q20, [%[outptr1]], #32\n" - "FMLA v21.4s, v5.4s, %[av].4s\n" - "STR q21, [%[outptr1]], #16\n" - - // Rows 2-3 - "LDP q16, q17, [%[outptr2]]\n" - "FMUL v16.4s, v16.4s, %[bv].4s\n" - "LDR q18, [%[outptr2], #32]\n" - "FMUL v17.4s, v17.4s, %[bv].4s\n" - "LDP q19, q20, [%[outptr3]]\n" - "FMUL v18.4s, v18.4s, %[bv].4s\n" - "LDR q21, [%[outptr3], #32]\n" - ASM_PREFETCH("[%[inptr], #960]") - "FMUL v19.4s, v19.4s, %[bv].4s\n" - "LDP q0, q1, [%[inptr], #96]\n" - "FMUL v20.4s, v20.4s, %[bv].4s\n" - "LDP q2, q3, [%[inptr], #128]\n" - "FMUL v21.4s, v21.4s, %[bv].4s\n" - "LDP q4, q5, [%[inptr], #160]\n" - "FMLA v16.4s, v0.4s, %[av].4s\n" - ASM_PREFETCH("[%[inptr], #1024]") - "FMLA v17.4s, v1.4s, %[av].4s\n" - "STP q16, q17, [%[outptr2]], #32\n" - "FMLA v18.4s, v2.4s, %[av].4s\n" - "STR q18, [%[outptr2]], #16\n" - "FMLA v19.4s, v3.4s, %[av].4s\n" - ASM_PREFETCH("[%[inptr], #1088]") - "FMLA v20.4s, v4.4s, %[av].4s\n" - "STP q19, q20, [%[outptr3]], #32\n" - "FMLA v21.4s, v5.4s, %[av].4s\n" - "STR q21, [%[outptr3]], #16\n" - - // Rows 4-5 - ASM_PREFETCH("[%[outptr0], #80]") - "LDP q16, q17, [%[outptr4]]\n" - "FMUL v16.4s, v16.4s, %[bv].4s\n" - "LDR q18, [%[outptr4], #32]\n" - "FMUL v17.4s, v17.4s, %[bv].4s\n" - "LDP q19, q20, [%[outptr5]]\n" - "FMUL v18.4s, v18.4s, %[bv].4s\n" - "LDR q21, [%[outptr5], #32]\n" - ASM_PREFETCH("[%[outptr1], #80]") - "FMUL v19.4s, v19.4s, %[bv].4s\n" - "LDP q0, q1, [%[inptr], #192]\n" - "FMUL v20.4s, v20.4s, %[bv].4s\n" - "LDP q2, q3, [%[inptr], #224]\n" - "FMUL v21.4s, v21.4s, %[bv].4s\n" - "LDP q4, q5, [%[inptr], #256]\n" - "FMLA v16.4s, v0.4s, %[av].4s\n" - ASM_PREFETCH("[%[outptr2], #80]") - "FMLA v17.4s, v1.4s, %[av].4s\n" - "STP q16, q17, [%[outptr4]], #32\n" - "FMLA v18.4s, v2.4s, %[av].4s\n" - "STR q18, [%[outptr4]], #16\n" - "FMLA v19.4s, v3.4s, %[av].4s\n" - ASM_PREFETCH("[%[outptr3], #80]") - "FMLA v20.4s, v4.4s, %[av].4s\n" - "STP q19, q20, [%[outptr5]], #32\n" - "FMLA v21.4s, v5.4s, %[av].4s\n" - "STR q21, 
[%[outptr5]], #16\n" - - // Rows 6-7 - ASM_PREFETCH("[%[outptr4], #80]") - "LDP q16, q17, [%[outptr6]]\n" - "FMUL v16.4s, v16.4s, %[bv].4s\n" - "LDR q18, [%[outptr6], #32]\n" - "FMUL v17.4s, v17.4s, %[bv].4s\n" - "LDP q19, q20, [%[outptr7]]\n" - "FMUL v18.4s, v18.4s, %[bv].4s\n" - "LDR q21, [%[outptr7], #32]\n" - ASM_PREFETCH("[%[outptr5], #80]") - "FMUL v19.4s, v19.4s, %[bv].4s\n" - "LDP q0, q1, [%[inptr], #288]\n" - "FMUL v20.4s, v20.4s, %[bv].4s\n" - "LDP q2, q3, [%[inptr], #320]\n" - "FMUL v21.4s, v21.4s, %[bv].4s\n" - "LDP q4, q5, [%[inptr], #352]\n" - "FMLA v16.4s, v0.4s, %[av].4s\n" - ASM_PREFETCH("[%[outptr6], #128]") - "FMLA v17.4s, v1.4s, %[av].4s\n" - "STP q16, q17, [%[outptr6]], #32\n" - "FMLA v18.4s, v2.4s, %[av].4s\n" - "STR q18, [%[outptr6]], #16\n" - "FMLA v19.4s, v3.4s, %[av].4s\n" - ASM_PREFETCH("[%[outptr7], #128]") - "FMLA v20.4s, v4.4s, %[av].4s\n" - "STP q19, q20, [%[outptr7]], #32\n" - "FMLA v21.4s, v5.4s, %[av].4s\n" - "STR q21, [%[outptr7]], #16\n" - "ADD %[inptr], %[inptr], #384\n" - : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), - [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7), - [inptr] "+r" (inptr) - : [av] "w" (av), [bv] "w" (bv) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", "v18", "v19", "v20", "v21" - ); - } - } - } -} - -#endif // __aarch64__ diff --git a/arm_compute/core/NEON/kernels/assembly/newgemm_lib.hpp b/arm_compute/core/NEON/kernels/assembly/newgemm_lib.hpp new file mode 100644 index 0000000000..b7cc3d773b --- /dev/null +++ b/arm_compute/core/NEON/kernels/assembly/newgemm_lib.hpp @@ -0,0 +1,410 @@ +/* + * Copyright (c) 2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#pragma once + +#include <fcntl.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/types.h> +#include <sys/stat.h> + +#include <fstream> +#include <iostream> +#include <regex> +#include <sstream> +#include <thread> +#include <vector> + +extern int l1_cache_size; +extern int l2_cache_size; +extern int force_cpu; + +#ifdef __ANDROID__ +/* Android's C++ library lacks std::stoul/std::stoi; provide replacements. + * These honour 'base' and a null 'pos', since the callers below pass + * nullptr and bases other than 10. */ +inline unsigned long stoul( const std::string& str, std::size_t* pos = 0, int base = 10 ) +{ + char *end; + const unsigned long ret = strtoul( str.c_str(), &end, base); + if (pos) { + *pos = end - str.c_str(); + } + return ret; +} +inline int stoi( const std::string& str, std::size_t* pos = 0, int base = 10 ) +{ + char *end; + const int ret = static_cast<int>(strtol( str.c_str(), &end, base )); + if (pos) { + *pos = end - str.c_str(); + } + return ret; +} +#endif + + +#ifndef BARE_METAL +#include <sys/auxv.h> + +/* Get HWCAP bits from asm/hwcap.h */ +#include <asm/hwcap.h> +#endif /* !BARE_METAL */ + +/* Make sure the bits we care about are defined, just in case asm/hwcap.h is + * out of date (or for bare metal mode) */ +#ifndef HWCAP_ASIMDHP +#define HWCAP_ASIMDHP (1 << 10) +#endif + +#ifndef HWCAP_CPUID +#define HWCAP_CPUID (1 << 11) +#endif + +#ifndef HWCAP_ASIMDDP +#define HWCAP_ASIMDDP (1 << 20) +#endif + +#define CPUINFO_HACK + +//unsigned int get_cpu_impl(); + + +/* CPU models - we only need to detect CPUs we have + * microarchitecture-specific code for. + * + * Architecture features are detected via HWCAPs. + */ +enum class CPUModel { + GENERIC = 0x0001, + A53 = 0x0010, + A55r0 = 0x0011, + A55r1 = 0x0012, +}; + +class CPUInfo +{ +private: + struct PerCPUData { + CPUModel model = CPUModel::GENERIC; + uint32_t midr = 0; + bool model_set = false; + }; + + std::vector<PerCPUData> _percpu={}; + + bool _cpuid = false; + bool _fp16 = false; + bool _dotprod = false; + + unsigned int L1_cache_size = 32768; + unsigned int L2_cache_size = 262144; + + /* Convert an MIDR register value to a CPUModel enum value. */ + CPUModel midr_to_model(const unsigned int midr) const { + CPUModel model; + + // Unpack variant and CPU ID + int variant = (midr >> 20) & 0xF; + int cpunum = (midr >> 4) & 0xFFF; + + /* Only CPUs we have code paths for are detected. All other CPUs + * can be safely classed as "GENERIC" + */ + + switch(cpunum) { + case 0xd03: + model = CPUModel::A53; + break; + + case 0xd05: + if (variant) { + model = CPUModel::A55r1; + } else { + model = CPUModel::A55r0; + } + break; + + default: + model = CPUModel::GENERIC; + break; + } + + return model; + } + + /* If the CPUID capability is present, MIDR information is provided in + /sys. Use that to populate the CPU model table. */ + void populate_models_cpuid() { + for (unsigned long int i=0; i<_percpu.size(); i++) { + std::stringstream str; + str << "/sys/devices/system/cpu/cpu" << i << "/regs/identification/midr_el1"; + std::ifstream file; + + file.open(str.str(), std::ios::in); + + if (file.is_open()) { + std::string line; + + if (bool(getline(file, line))) { + const unsigned long midr = stoul(line, nullptr, 16); + + _percpu[i].midr = (midr & 0xffffffff); + _percpu[i].model = midr_to_model(_percpu[i].midr); + _percpu[i].model_set = true; + } + } + } + } + + /* If "long-form" cpuinfo is present, parse that to populate models. 
*/ + void populate_models_cpuinfo() { + std::regex proc_regex("^processor.*(\\d+)$"); + std::regex imp_regex("^CPU implementer.*0x(..)$"); + std::regex var_regex("^CPU variant.*0x(.)$"); + std::regex part_regex("^CPU part.*0x(...)$"); + std::regex rev_regex("^CPU revision.*(\\d+)$"); + + std::ifstream file; + file.open("/proc/cpuinfo", std::ios::in); + + if (file.is_open()) { + std::string line; + int midr=0; + int curcpu=-1; + + while(bool(getline(file, line))) { + std::smatch match; + + if (std::regex_match(line, match, proc_regex)) { + std::string id = match[1]; + int newcpu=stoi(id, nullptr, 0); + + if (curcpu >= 0 && midr==0) { + // Matched a new CPU ID without any description of the previous one - looks like old format. + return; + } + + if (curcpu >= 0) { + _percpu[curcpu].midr = midr; + _percpu[curcpu].model = midr_to_model(midr); + _percpu[curcpu].model_set = true; + + printf("CPU %d: %x\n",curcpu,midr); + } + + midr=0; + curcpu=newcpu; + + continue; + } + + if (std::regex_match(line, match, imp_regex)) { + int impv = stoi(match[1], nullptr, 16); + midr |= (impv << 24); + continue; + } + + if (std::regex_match(line, match, var_regex)) { + int varv = stoi(match[1], nullptr, 16); + midr |= (varv << 16); + continue; + } + + if (std::regex_match(line, match, part_regex)) { + int partv = stoi(match[1], nullptr, 16); + midr |= (partv << 4); + continue; + } + + if (std::regex_match(line, match, rev_regex)) { + int regv = stoi(match[1], nullptr, 10); + midr |= (regv); + midr |= (0xf << 16); + continue; + } + } + + if (curcpu >= 0) { + _percpu[curcpu].midr = midr; + _percpu[curcpu].model = midr_to_model(midr); + _percpu[curcpu].model_set = true; + + printf("CPU %d: %x\n",curcpu,midr); + } + } + } + + /* Identify the maximum valid CPUID in the system. This reads + * /sys/devices/system/cpu/present to get the information. */ + int get_max_cpus() { + int max_cpus = 1; + +#ifndef BARE_METAL + std::ifstream CPUspresent; + CPUspresent.open("/sys/devices/system/cpu/present", std::ios::in); + bool success = false; + + if (CPUspresent.is_open()) { + std::string line; + + if (bool(getline(CPUspresent, line))) { + /* The content of this file is a list of ranges or single values, e.g. + * 0-5, or 1-3,5,7 or similar. As we are interested in the + * max valid ID, we just need to find the last valid + * delimiter ('-' or ',') and parse the integer immediately after that. + */ + auto startfrom=line.begin(); + + for (auto i=line.begin(); i<line.end(); ++i) { + if (*i=='-' || *i==',') { + startfrom=i+1; + } + } + + line.erase(line.begin(), startfrom); + + max_cpus = stoi(line, nullptr, 0) + 1; + success = true; + } + } + + // Return std::thread::hardware_concurrency() as a fallback. + if (!success) { + max_cpus = std::thread::hardware_concurrency(); + } +#endif // !BARE_METAL + + return max_cpus; + } + +public: + CPUInfo() { +#ifndef BARE_METAL + unsigned long hwcaps = getauxval(AT_HWCAP); + + if (hwcaps & HWCAP_CPUID) { + _cpuid = true; + } + + if (hwcaps & HWCAP_ASIMDHP) { + _fp16 = true; + } + + if (hwcaps & HWCAP_ASIMDDP) { + _dotprod = true; + } + +#ifdef __aarch64__ + /* Pre-4.15 kernels don't have the ASIMDDP bit. + * + * Although the CPUID bit allows us to read the feature register + * directly, the kernel quite sensibly masks this to only show + * features known by it to be safe to show to userspace. As a + * result, pre-4.15 kernels won't show the relevant bit in the + * feature registers either. + * + * So for now, use a whitelist of CPUs known to support the feature. 
+ */ + if (!_dotprod && _cpuid) { + /* List of CPUs with dot product support: A55r1 A75r1 A75r2 */ + const unsigned int dotprod_whitelist_masks[] = { 0xfff0fff0, 0xfff0fff0, 0xfff0fff0, 0 }; + const unsigned int dotprod_whitelist_values[] = { 0x4110d050, 0x4110d0a0, 0x4120d0a0, 0 }; + + unsigned long cpuid; + + __asm __volatile ( + "mrs %0, midr_el1\n" + : "=r" (cpuid) + : + : + ); + + for (int i=0;dotprod_whitelist_values[i];i++) { + if ((cpuid & dotprod_whitelist_masks[i]) == dotprod_whitelist_values[i]) { + _dotprod = true; + break; + } + } + } +#endif + _percpu.resize(get_max_cpus()); +#endif + if (_cpuid) { + populate_models_cpuid(); + } else { + populate_models_cpuinfo(); + } + } + + void set_fp16(const bool fp16) { + _fp16 = fp16; + } + + void set_dotprod(const bool dotprod) { + _dotprod = dotprod; + } + + void set_cpu_model(unsigned long cpuid, CPUModel model) { + if (_percpu.size() > cpuid) { + _percpu[cpuid].model = model; + _percpu[cpuid].model_set = true; + } + } + + bool has_fp16() const { + return _fp16; + } + + bool has_dotprod() const { + return _dotprod; + } + + CPUModel get_cpu_model(unsigned long cpuid) const { + if (cpuid < _percpu.size()) { + return _percpu[cpuid].model; + } + + return CPUModel::GENERIC; + } + + CPUModel get_cpu_model() const { +#ifdef BARE_METAL + return get_cpu_model(0); +#else + return get_cpu_model(sched_getcpu()); +#endif + } + + unsigned int get_L1_cache_size() const { + return L1_cache_size; + } + + void set_L1_cache_size(unsigned int size) { + L1_cache_size = size; + } + + unsigned int get_L2_cache_size() const { + return L2_cache_size; + } + + void set_L2_cache_size(unsigned int size) { + L2_cache_size = size; + } +}; + +CPUInfo *get_CPUInfo(); diff --git a/arm_compute/core/NEON/kernels/assembly/profiler.hpp b/arm_compute/core/NEON/kernels/assembly/profiler.hpp deleted file mode 100644 index f7a1d1c70c..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/profiler.hpp +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#pragma once - -#ifdef CYCLE_PROFILING - -#include "../perf.h" - -class profiler { -private: - static const int maxevents = 10000; - unsigned long times[maxevents]; - unsigned long units[maxevents]; - int events[maxevents]; - int currentevent; - int countfd; - -public: - profiler() { - currentevent=0; - countfd=open_cycle_counter(); - } - - ~profiler() { - close(countfd); - int tots[5]; - unsigned long counts[5]; - unsigned long tunits[5]; - const char * descs[] = { "Prepare A", "Prepare B", "Kernel", "Merge" }; - - for (int i=1; i<5; i++) { - tots[i] = 0; - counts[i] = 0; - tunits[i] = 0; - } - - printf("Profiled events:\n"); - for (int i=0; i<currentevent; i++) { - tots[events[i]]++; - counts[events[i]] += times[i]; - tunits[events[i]] += units[i]; - } - - printf("%20s %9s %9s %9s %12s %9s\n", "", "Events", "Total", "Average", "Bytes/MACs", "Per cycle"); - for (int i=1; i<5; i++) { - printf("%20s: %9d %9ld %9ld %12lu %9.2f\n",descs[i-1],tots[i],counts[i],counts[i]/tots[i],tunits[i],(float)tunits[i]/counts[i]); - } - } - - template <typename T> - void operator() (int i, unsigned long u, T func) { - if (currentevent==maxevents) { - func(); - } else { - events[currentevent] = i; - units[currentevent] = u; - start_counter(countfd); - func(); - long long cycs = stop_counter(countfd); - times[currentevent++] = cycs; - } - } -}; - -#else - -class profiler { -public: - template <typename T> - void operator() (int i, unsigned long u, T func) { - func(); - } -}; - -#endif - -#define PROFILE_PREPA 1 -#define PROFILE_PREPB 2 -#define PROFILE_KERNEL 3 -#define PROFILE_MERGE 4 - - diff --git a/arm_compute/core/NEON/kernels/assembly/transform.hpp b/arm_compute/core/NEON/kernels/assembly/transform.hpp deleted file mode 100644 index 717506f54c..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/transform.hpp +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -/* - * Generic transform. - * - * Assuming the untransposed case, this works by first reading <BlockBy> - * consecutive values from the first input row. This same number of values - * are then read from the next <IntBy-1> rows. Now return to the first - * input row and repeat. - * - * Need to cope with the work requested in either dimension not actually - * being a multiple of the block sizes. 
- */ -template <unsigned IntBy, unsigned int BlockBy, bool Transposed, size_t TOutSize, size_t TInSize> -struct TransformImpl { - template <typename TOut, typename TIn> - static void Transform(TOut* out, const TIn* const in, const int stride, - const int y0, const int ymax, const int x0, const int xmax) { - const int n_whole_y_blocks = (ymax - y0) / IntBy; - const int y_remainders = (ymax - y0) % IntBy; - const int n_y_blocks = n_whole_y_blocks + (y_remainders ? 1 : 0); - - const int n_whole_x_blocks = (xmax - x0) / BlockBy; - const int x_remainders = (xmax - x0) % BlockBy; - const int n_x_blocks = n_whole_x_blocks + (x_remainders ? 1 : 0); - - // "Y" loop: advance down the rows of the source IntBy rows at a time. - // Set up fill_rows to show the number rows to copy from, and blank_rows - // for the number of blank rows to add. - for (int y_block=0 ; y_block < n_y_blocks; y_block++) { - int fill_rows = (y_block < n_whole_y_blocks) ? IntBy : y_remainders; - int blank_rows = IntBy - fill_rows; - - int y_base = y0 + (y_block * IntBy); - - // So now advance along this block of rows, BlockBy columns at a time. - for (int x_block=0 ; x_block < n_x_blocks; x_block++) { - int fill_cols = (x_block < n_whole_x_blocks) ? BlockBy : x_remainders; - int blank_cols = BlockBy - fill_cols; - - int x_base = x0 + (x_block * BlockBy); - - for (int row = 0; row < fill_rows; row++) { - for (int col = 0; col < fill_cols; col++) { - // In-range copy. If it's transposed, we reverse the sense of rows and columns here. - if (Transposed) { - *out++ = static_cast<TOut>(in[(x_base + col) * stride + y_base + row]); - } else { - *out++ = static_cast<TOut>(in[(y_base + row) * stride + x_base + col]); - } - } - // "col" tail - row is in range but column is out of range. - for (int col=0; col < blank_cols; col++) { - *out++ = static_cast<TOut>(0); - } - } - // "row" tail - row is out of range so fill with zeros always. - for (int row = 0; row < blank_rows; row++) { - for (int col=0; col < (fill_cols + blank_cols); col++) { - *out++ = static_cast<TOut>(0); - } - } - } - } - } - - template <typename T> - static inline void Transform(T* out, const T* const in, const int stride, - const int k0, const int kmax, const int x0, const int xmax) { - Transform<T, T>(out, in, stride, k0, kmax, x0, xmax); - } -}; - -/*****************************************************************************/ -template <unsigned int IntBy, unsigned int BlockBy, bool Transposed, typename TOut, typename TIn> -void Transform( - TOut* out, const TIn* const in, const int stride, - const int k0, const int kmax, const int x0, const int xmax -) { - // Redirect to a specialised implementation predicated on argument size. - TransformImpl<IntBy, BlockBy, Transposed, sizeof(TOut), sizeof(TIn)>::Transform( - out, in, stride, k0, kmax, x0, xmax - ); -} -/*****************************************************************************/ - -#include "transforms/list.hpp" diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/a32_interleave_6way_32bit.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/a32_interleave_6way_32bit.hpp deleted file mode 100644 index 4a1b5d2bf2..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/transforms/a32_interleave_6way_32bit.hpp +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Copyright (c) 2017-2018 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __arm__ - -#include <arm_neon.h> -#include "asmlib.hpp" - -template<> -template<typename T> -inline void TransformImpl<6, 1, false, 4, 4>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) { - uint32_t *outptr = reinterpret_cast<uint32_t *>(out); - const uint32_t *inptr = reinterpret_cast<const uint32_t *>(in); - - uint32_t zerobuff[8]; - - for (int y=y0; y<ymax; y+=6) { - const uint32_t *inptr0 = inptr + y * ldin + k0; - const uint32_t *inptr1 = inptr0 + ldin; - const uint32_t *inptr2 = inptr1 + ldin; - const uint32_t *inptr3 = inptr2 + ldin; - const uint32_t *inptr4 = inptr3 + ldin; - const uint32_t *inptr5 = inptr4 + ldin; - - //prefetch_2x(inptr0); - //prefetch_2x(inptr1); - //prefetch_2x(inptr2); - //prefetch_2x(inptr3); - //prefetch_2x(inptr4); - //prefetch_2x(inptr5); - - int x=(kmax-k0); - for (;x>7;x-=8) { - /* Cope with ragged cases by copying from a buffer of zeroes instead */ - if ((y + 5) >= ymax) { - switch ((y + 5) - ymax) { - /* Everything falls through in here */ - case 4: - inptr1 = zerobuff; - case 3: - inptr2 = zerobuff; - case 2: - inptr3 = zerobuff; - case 1: - inptr4 = zerobuff; - case 0: - inptr5 = zerobuff; - default: - break; - } - } - - - __asm __volatile ( - // Load up 8 elements (2 vectors) from each of 6 sources. 
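The asm below implements the usual GEMM interleave with VZIP shuffles: eight consecutive elements are read from each of the six source rows and written out element-major, giving A0 B0 C0 D0 E0 F0, A1 B1 C1 D1 E1 F1, and so on. A scalar reference for one 8-wide step (a sketch, not the library routine):

#include <cstdint>

// One 8-element step of the 6-way 32-bit interleave.
static void interleave_6x8_ref(uint32_t *&outptr, const uint32_t *(&row)[6]) {
    for (int e = 0; e < 8; e++) {      // element within the block
        for (int r = 0; r < 6; r++) {  // one value from each source row
            *outptr++ = row[r][e];
        }
    }
    for (int r = 0; r < 6; r++) {
        row[r] += 8;                   // all six sources advance together
    }
}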
- "VLD1.32 {d0-d3}, [%[inptr0]]!\n" // q0=A0A1A2A3 - "VLD1.32 {d4-d7}, [%[inptr1]]!\n" // q2=B0B1B2B3 - "VLD1.32 {d8-d11}, [%[inptr2]]!\n" // q4=C0C1C2C3 - "VZIP.32 q0, q4\n" // q0=A0C0A1C1, q4 = A2C2A3C3 - "VLD1.32 {d12-d15}, [%[inptr3]]!\n" // q6=D0D1D2D3 - "VZIP.32 q2, q6\n" // q2=B0D0B1D1, q6 = B2D2B3D3 - "VLD1.32 {d16-d19}, [%[inptr4]]!\n" - "VLD1.32 {d20-d23}, [%[inptr5]]!\n" - "VZIP.32 q8, q10\n" // q8=E0F0E1F1, q10 = E2F2E3F3 - ASM_PREFETCH("[%[inptr0], #128]") - "VZIP.32 q0, q2\n" // q0 = A0B0C0D0, q2 = A1B1C1D1 - - // Store first elements - "VST1.32 {d0-d1}, [%[outptr]]!\n" - "VST1.32 {d16}, [%[outptr]]!\n" - - "VZIP.32 q4, q6\n" // q4 = A2B2C2D2, q6 = A3B3C3D3 - - // Store second elements - "VST1.32 {d4-d5}, [%[outptr]]!\n" - "VZIP.32 q1, q5\n" - ASM_PREFETCH("[%[inptr1], #128]") - "VST1.32 {d17}, [%[outptr]]!\n" - "VZIP.32 q3, q7\n" - - // Store third elements - "VZIP.32 q9, q11\n" - "VST1.32 {d8-d9}, [%[outptr]]!\n" - "VZIP.32 q1, q3\n" - ASM_PREFETCH("[%[inptr2], #128]") - "VST1.32 {d20}, [%[outptr]]!\n" - - // Store fourth elements - "VZIP.32 q5, q7\n" - "VST1.32 {d12-d13}, [%[outptr]]!\n" - ASM_PREFETCH("[%[inptr3], #128]") - "VST1.32 {d21}, [%[outptr]]!\n" - - // Fifth - "VST1.32 {d2-d3}, [%[outptr]]!\n" - ASM_PREFETCH("[%[inptr4], #128]") - "VST1.32 {d18}, [%[outptr]]!\n" - - // Sixth - "VST1.32 {d6-d7}, [%[outptr]]!\n" - ASM_PREFETCH("[%[inptr5], #128]") - "VST1.32 {d19}, [%[outptr]]!\n" - - // Seventh - "VST1.32 {d10-d11}, [%[outptr]]!\n" - "VST1.32 {d22}, [%[outptr]]!\n" - - // Eigth - "VST1.32 {d14-d15}, [%[outptr]]!\n" - "VST1.32 {d23}, [%[outptr]]!\n" - - : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), - [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [outptr] "+r" (outptr) - : - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12" - ); - } - - for (;x>0;x--) { - *outptr++ = *inptr0++; - *outptr++ = *inptr1++; - *outptr++ = *inptr2++; - *outptr++ = *inptr3++; - *outptr++ = *inptr4++; - *outptr++ = *inptr5++; - } - } -} - -#endif // __arm__ diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/a32_transpose_interleave_8way_32bit.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/a32_transpose_interleave_8way_32bit.hpp deleted file mode 100644 index a7e17fa074..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/transforms/a32_transpose_interleave_8way_32bit.hpp +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Copyright (c) 2017-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __arm__ - -#include "transpose_interleave_common.hpp" - -// Generic unblocked transposed 8x32-bit sized specialisation -template <> -template <typename T> -inline void TransformImpl<8, 1, true, 4, 4>::Transform( - T* out, const T* const in, const int stride, - const int x0, const int xmax, const int k0, const int kmax -) { - // Redirect to a 16x uint16_t specialisation - TransformImpl<16, 1, true, 2, 2>::Transform( - reinterpret_cast<uint16_t *>(out), - reinterpret_cast<const uint16_t * const>(in), - stride*2, x0*2, xmax*2, k0, kmax - ); -} - -// Generic 12x16-bit sized specialisation -template <> -template <typename T> -inline void TransformImpl<16, 1, true, 2, 2>::Transform( - T* out, const T* const in, const int stride, - const int x0, const int xmax, const int k0, const int kmax -) { - // Redirect to a uint16_t specialisation - Transform( - reinterpret_cast<uint16_t *>(out), - reinterpret_cast<const uint16_t * const>(in), - stride, x0, xmax, k0, kmax - ); -} - -// Specialised 16 x uint16_t version -template <> -inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out) { - __asm volatile ( - "VLD1.32 {d0-d3}, [%[in0]]!\n" - "VST1.32 {d0-d3}, [%[out]]\n" - ASM_PREFETCH("[%[in0], #192]") - : [in0] "+r" (in0), - [out] "+r" (out) - : - : "q0", "q1", "memory" - ); -} - -template <> -inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *out) { - __asm volatile ( - "VLD1.32 {d0-d3}, [%[in0]]!\n" - "VST1.32 {d0-d3}, [%[out]]!\n" - ASM_PREFETCH("[%[in0], #192]") - "VLD1.32 {d0-d3}, [%[in1]]!\n" - "VST1.32 {d0-d3}, [%[out]]\n" - ASM_PREFETCH("[%[in1], #192]") - "SUB %[out], %[out], #32\n" - : [in0] "+r" (in0), - [in1] "+r" (in1), - [out] "+r" (out) - : - : "q0", "q1", "memory" - ); -} - -template <> -inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out) { - __asm __volatile ( - "VLD1.32 {d0-d3}, [%[in0]]!\n" - "VST1.32 {d0-d3}, [%[out]]!\n" - ASM_PREFETCH("[%[in0], #192]") - "VLD1.32 {d0-d3}, [%[in1]]!\n" - "VST1.32 {d0-d3}, [%[out]]!\n" - ASM_PREFETCH("[%[in1], #192]") - "VLD1.32 {d0-d3}, [%[in2]]!\n" - "VST1.32 {d0-d3}, [%[out]]!\n" - ASM_PREFETCH("[%[in2], #192]") - "VLD1.32 {d0-d3}, [%[in3]]!\n" - "VST1.32 {d0-d3}, [%[out]]\n" - ASM_PREFETCH("[%[in3], #192]") - "SUB %[out], %[out], #96\n" - : [in0] "+r" (in0), - [in1] "+r" (in1), - [in2] "+r" (in2), - [in3] "+r" (in3), - [out] "+r" (out) - : - : "q0", "q1", "memory" - ); -} - -template <> -template <> -inline void TransformImpl<16, 1, true, 2, 2>::Transform( - uint16_t* out, const uint16_t* const in, const int stride, - const int x0, const int xmax, const int k0, const int kmax -) { - TransposeInterleaveCommon<16, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax); -} - -#endif // __arm__ diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/a64_block16_interleave4_8bit.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/a64_block16_interleave4_8bit.hpp deleted file mode 100644 index ac84567b54..0000000000 --- 
a/arm_compute/core/NEON/kernels/assembly/transforms/a64_block16_interleave4_8bit.hpp +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) 2017-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __aarch64__ - -#include <arm_neon.h> -#include "asmlib.hpp" - -template<> -template<typename T> -inline void TransformImpl<4, 16, false, 1, 1>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) { - uint8_t *outptr = (uint8_t *)out; - const uint8_t *inptr = (uint8_t *)in; - - uint8_t zerobuff[16]; - - for (int y=y0; y<ymax; y+=4) { - const uint8_t *inptr0 = inptr + y * ldin + k0; - const uint8_t *inptr1 = inptr0 + ldin; - const uint8_t *inptr2 = inptr1 + ldin; - const uint8_t *inptr3 = inptr2 + ldin; - - prefetch_2x(inptr0); - prefetch_2x(inptr1); - prefetch_2x(inptr2); - prefetch_2x(inptr3); - - int x=(kmax-k0); - for (;x>15;x-=16) { - /* Cope with ragged cases by copying from a buffer of zeroes instead */ - if ((y + 3) >= ymax) { - switch ((y + 3) - ymax) { - /* Everything falls through in here */ - case 2: - inptr1 = zerobuff; - case 1: - inptr2 = zerobuff; - case 0: - inptr3 = zerobuff; - default: - break; - } - } - - __asm __volatile ( - "LDR q0, [%[inptr0]], #16\n" - ASM_PREFETCH("[%[inptr0], #176]") - "LDR q1, [%[inptr1]], #16\n" - ASM_PREFETCH("[%[inptr1], #176]") - "STP q0, q1, [%[outptr]], #32\n" - "LDR q0, [%[inptr2]], #16\n" - ASM_PREFETCH("[%[inptr2], #176]") - "LDR q1, [%[inptr3]], #16\n" - ASM_PREFETCH("[%[inptr3], #176]") - "STP q0, q1, [%[outptr]], #32\n" - : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), - [outptr] "+r" (outptr) - : - : "v0", "v1" - ); - } - - if (x>0) { - /* Need to duplicate this here, in case we didn't run the main loop. 
*/ - if ((y + 3) >= ymax) { - switch ((y + 3) - ymax) { - /* Everything falls through in here */ - case 2: - inptr1 = zerobuff; - case 1: - inptr2 = zerobuff; - case 0: - inptr3 = zerobuff; - default: - break; - } - } - - /* We have to write out 16 values, copy as many legal values as there are and pad with 0 */ - auto f = [&outptr, x](const uint8_t *&p) { - for (int i=0; i<16; i++) { - if (i < x) { - *outptr++ = *p++; - } else { - *outptr++ = 0; - } - } - }; - - f(inptr0); - f(inptr1); - f(inptr2); - f(inptr3); - } - } -} - -#endif // __aarch64__ diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_16bit.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_16bit.hpp deleted file mode 100644 index bdc05473b4..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_16bit.hpp +++ /dev/null @@ -1,162 +0,0 @@ -/* - * Copyright (c) 2017-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __aarch64__ - -#include <arm_neon.h> -#include "asmlib.hpp" - -template<> -template<typename T> -void TransformImpl<8, 1, false, 2, 2>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) { - uint16_t *outptr = (uint16_t *)out; - const uint16_t *inptr = (const uint16_t *)in; - - uint16_t zerobuff[24]; - - for (int y=y0; y<ymax; y+=8) { - const uint16_t *inptr0 = inptr + y * ldin + k0; - const uint16_t *inptr1 = inptr0 + ldin; - const uint16_t *inptr2 = inptr1 + ldin; - const uint16_t *inptr3 = inptr2 + ldin; - const uint16_t *inptr4 = inptr3 + ldin; - const uint16_t *inptr5 = inptr4 + ldin; - const uint16_t *inptr6 = inptr5 + ldin; - const uint16_t *inptr7 = inptr6 + ldin; - - prefetch_2x(inptr0); - prefetch_2x(inptr1); - prefetch_2x(inptr2); - prefetch_2x(inptr3); - prefetch_2x(inptr4); - prefetch_2x(inptr5); - prefetch_2x(inptr6); - prefetch_2x(inptr7); - - int x=(kmax-k0); - for (;x>7;x-=8) { - /* Cope with ragged cases by copying from a buffer of zeroes instead */ - if ((y + 7) >= ymax) { - switch ((y + 7) - ymax) { - /* Everything falls through in here */ - case 6: - inptr1 = zerobuff; - case 5: - inptr2 = zerobuff; - case 4: - inptr3 = zerobuff; - case 3: - inptr4 = zerobuff; - case 2: - inptr5 = zerobuff; - case 1: - inptr6 = zerobuff; - case 0: - inptr7 = zerobuff; - } - } - - int skippf = (x & 31); - __asm __volatile ( - // Load up 8 elements (1 vector) from each of 8 sources. 
- "CBNZ %w[skippf], 1f\n" - ASM_PREFETCH("[%[inptr0], #128]") - ASM_PREFETCH("[%[inptr1], #128]") - ASM_PREFETCH("[%[inptr2], #128]") - ASM_PREFETCH("[%[inptr3], #128]") - "1:\n" - - "LDR q0, [%[inptr0]], #16\n" // q0=A0A1A2A3A4A5A6A7 - "LDR q4, [%[inptr4]], #16\n" // q8=E0E1E2E3E4E5E6E7 - "LDR q2, [%[inptr2]], #16\n" // q4=C0C1C2C3... - "LDR q6, [%[inptr6]], #16\n" - "ZIP1 v8.8h, v0.8h, v4.8h\n" // q8=A0E0A1E1A2E2A3E3 - "ZIP2 v16.8h, v0.8h, v4.8h\n" // q16=A4E4A5E5A6E6A7E7 - "ZIP1 v9.8h, v2.8h, v6.8h\n" // q9=C0G0C1G1C2G2C3G3 - "ZIP2 v17.8h, v2.8h, v6.8h\n" // q17=C4G4C5G5C6G6C7G7 - "LDR q1, [%[inptr1]], #16\n" // q1=B0B1B2B3B4B5B6B7 - "LDR q5, [%[inptr5]], #16\n" - "LDR q3, [%[inptr3]], #16\n" // q3=D0D1D2D3.... - "LDR q7, [%[inptr7]], #16\n" - "ZIP1 v10.8h, v1.8h, v5.8h\n" // q18=B0F0B1F1B2F2B3F3 - "ZIP2 v18.8h, v1.8h, v5.8h\n" // q18=B4F4B5F5B6F6B7F7 - "ZIP1 v11.8h, v3.8h, v7.8h\n" // q19=D0H0D1H1D2H2D3H3 - "ZIP2 v19.8h, v3.8h, v7.8h\n" // q19=D4H4D5H5D6H6D7H7 - - "ZIP1 v12.8h, v8.8h, v9.8h\n" // q20=A0C0E0G0A1C1E1G1 - "ZIP2 v20.8h, v8.8h, v9.8h\n" - "ZIP1 v13.8h, v10.8h, v11.8h\n" // q21=B0D0F0H0B1I1F1H1 - "ZIP2 v21.8h, v10.8h, v11.8h\n" - - "CBNZ %w[skippf], 2f\n" - ASM_PREFETCH("[%[inptr4], #112]") - ASM_PREFETCH("[%[inptr5], #112]") - ASM_PREFETCH("[%[inptr6], #112]") - ASM_PREFETCH("[%[inptr7], #112]") - "2:\n" - - "ZIP1 v22.8h, v16.8h, v17.8h\n" - "ZIP2 v30.8h, v16.8h, v17.8h\n" - "ZIP1 v23.8h, v18.8h, v19.8h\n" - "ZIP2 v31.8h, v18.8h, v19.8h\n" - - "ZIP1 v14.8h, v12.8h, v13.8h\n" // q22=A0B0C0D0E0F0G0H0 - "ZIP2 v15.8h, v12.8h, v13.8h\n" // q23=A1B1C1D1E1F1G1H1 - "STP q14, q15, [%[outptr]], #32\n" // Write back first two elements - - "ZIP1 v0.8h, v20.8h, v21.8h\n" - "ZIP2 v1.8h, v20.8h, v21.8h\n" - "STP q0, q1, [%[outptr]], #32\n" // Write back next two elements - - "ZIP1 v2.8h, v22.8h, v23.8h\n" - "ZIP2 v3.8h, v22.8h, v23.8h\n" - "STP q2, q3, [%[outptr]], #32\n" // Write back next two elements - - "ZIP1 v4.8h, v30.8h, v31.8h\n" - "ZIP2 v5.8h, v30.8h, v31.8h\n" - "STP q4, q5, [%[outptr]], #32\n" // Write back last two elements - : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), - [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr) - : [skippf] "r" (skippf) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", - "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", - "v25", "v26", "v27", "v28", "v29", "v30", "v31" - ); - } - - for (;x>0;x--) { - *outptr++ = *inptr0++; - *outptr++ = *inptr1++; - *outptr++ = *inptr2++; - *outptr++ = *inptr3++; - *outptr++ = *inptr4++; - *outptr++ = *inptr5++; - *outptr++ = *inptr6++; - *outptr++ = *inptr7++; - } - } -} - -#endif // __aarch64__ diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_32bit.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_32bit.hpp deleted file mode 100644 index bd5125afab..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_32bit.hpp +++ /dev/null @@ -1,173 +0,0 @@ -/* - * Copyright (c) 2017-2018 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __aarch64__ - -#include <arm_neon.h> -#include "asmlib.hpp" - -template<> -template<typename T> -inline void TransformImpl<8, 1, false, 4, 4>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) { - uint32_t *outptr = (uint32_t *)out; - const uint32_t *inptr = (uint32_t *)in; - - uint32_t zerobuff[8]; - - for (int y=y0; y<ymax; y+=8) { - const uint32_t *inptr0 = inptr + y * ldin + k0; - const uint32_t *inptr1 = inptr0 + ldin; - const uint32_t *inptr2 = inptr1 + ldin; - const uint32_t *inptr3 = inptr2 + ldin; - const uint32_t *inptr4 = inptr3 + ldin; - const uint32_t *inptr5 = inptr4 + ldin; - const uint32_t *inptr6 = inptr5 + ldin; - const uint32_t *inptr7 = inptr6 + ldin; - - prefetch_2x(inptr0); - prefetch_2x(inptr1); - prefetch_2x(inptr2); - prefetch_2x(inptr3); - prefetch_2x(inptr4); - prefetch_2x(inptr5); - prefetch_2x(inptr6); - prefetch_2x(inptr7); - - int x=(kmax-k0); - for (;x>7;x-=8) { - /* Cope with ragged cases by copying from a buffer of zeroes instead */ - if ((y + 7) >= ymax) { - switch ((y + 7) - ymax) { - /* Everything falls through in here */ - case 6: - inptr1 = zerobuff; - case 5: - inptr2 = zerobuff; - case 4: - inptr3 = zerobuff; - case 3: - inptr4 = zerobuff; - case 2: - inptr5 = zerobuff; - case 1: - inptr6 = zerobuff; - case 0: - inptr7 = zerobuff; - default: - break; - } - } - - __asm __volatile ( - // Load up 8 elements (2 vectors) from each of 8 sources. 
- "LDP q0, q1, [%[inptr0]], #32\n" // q0=A0A1A2A3 - "LDP q2, q3, [%[inptr1]], #32\n" // q2=B0B1B2B3 - "LDP q4, q5, [%[inptr2]], #32\n" // q4=C0C1C2C3 - "ZIP1 v16.4s, v0.4s, v4.4s\n" // q16=A0C0A1C1 - ASM_PREFETCH("[%[inptr0], #128]") - "LDP q6, q7, [%[inptr3]], #32\n" // q6=D0D1D2D3 - "ZIP1 v17.4s, v2.4s, v6.4s\n" // q17=B0D0B1D1 - "LDP q8, q9, [%[inptr4]], #32\n" - "LDP q10, q11, [%[inptr5]], #32\n" - "LDP q12, q13, [%[inptr6]], #32\n" - "ZIP1 v18.4s, v8.4s, v12.4s\n" - ASM_PREFETCH("[%[inptr1], #128]") - "LDP q14, q15, [%[inptr7]], #32\n" - "ZIP1 v19.4s, v10.4s, v14.4s\n" - - "ZIP1 v20.4s, v16.4s, v17.4s\n" // q20=A0B0C0D0 - ASM_PREFETCH("[%[inptr2], #128]") - "ZIP1 v21.4s, v18.4s, v19.4s\n" - "ZIP2 v22.4s, v16.4s, v17.4s\n" - "ZIP2 v23.4s, v18.4s, v19.4s\n" - - "ZIP2 v16.4s, v0.4s, v4.4s\n" - ASM_PREFETCH("[%[inptr3], #128]") - "ZIP2 v17.4s, v2.4s, v6.4s\n" - "STP q20, q21, [%[outptr]], #32\n" // Write back the first element of each source - - "ZIP2 v18.4s, v8.4s, v12.4s\n" - "ZIP2 v19.4s, v10.4s, v14.4s\n" - "STP q22, q23, [%[outptr]], #32\n" // Write back the second element of each source - - "ZIP1 v20.4s, v16.4s, v17.4s\n" - ASM_PREFETCH("[%[inptr4], #128]") - "ZIP1 v21.4s, v18.4s, v19.4s\n" - "ZIP2 v22.4s, v16.4s, v17.4s\n" - "ZIP2 v23.4s, v18.4s, v19.4s\n" - - "ZIP1 v16.4s, v1.4s, v5.4s\n" - ASM_PREFETCH("[%[inptr5], #128]") - "ZIP1 v17.4s, v3.4s, v7.4s\n" - "STP q20, q21, [%[outptr]], #32\n" // Third element - - "ZIP1 v18.4s, v9.4s, v13.4s\n" - "ZIP1 v19.4s, v11.4s, v15.4s\n" - "STP q22, q23, [%[outptr]], #32\n" // Fourth element - - "ZIP1 v20.4s, v16.4s, v17.4s\n" - "ZIP1 v21.4s, v18.4s, v19.4s\n" - "ZIP2 v22.4s, v16.4s, v17.4s\n" - ASM_PREFETCH("[%[inptr6], #128]") - "ZIP2 v23.4s, v18.4s, v19.4s\n" - - "ZIP2 v16.4s, v1.4s, v5.4s\n" - "ZIP2 v17.4s, v3.4s, v7.4s\n" - "STP q20, q21, [%[outptr]], #32\n" // Fifth element - - "ZIP2 v18.4s, v9.4s, v13.4s\n" - ASM_PREFETCH("[%[inptr7], #128]") - "ZIP2 v19.4s, v11.4s, v15.4s\n" - "STP q22, q23, [%[outptr]], #32\n" // Sixth element - - "ZIP1 v20.4s, v16.4s, v17.4s\n" - "ZIP1 v21.4s, v18.4s, v19.4s\n" - "STP q20, q21, [%[outptr]], #32\n" // Seventh element - - "ZIP2 v22.4s, v16.4s, v17.4s\n" - "ZIP2 v23.4s, v18.4s, v19.4s\n" - "STP q22, q23, [%[outptr]], #32\n" // Eighth element - : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), - [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr) - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", - "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" - ); - } - - for (;x>0;x--) { - *outptr++ = *inptr0++; - *outptr++ = *inptr1++; - *outptr++ = *inptr2++; - *outptr++ = *inptr3++; - *outptr++ = *inptr4++; - *outptr++ = *inptr5++; - *outptr++ = *inptr6++; - *outptr++ = *inptr7++; - } - } -} - -#endif // __aarch64__ diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_half_to_float.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_half_to_float.hpp deleted file mode 100644 index 3c9e05223d..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_half_to_float.hpp +++ /dev/null @@ -1,189 +0,0 @@ -/* - * Copyright (c) 2017-2018 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#if defined( __aarch64__) && defined( __ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - -#include <arm_neon.h> -#include "asmlib.hpp" - -template<> -template<> -inline void TransformImpl<8, 1, false, 4, 2>::Transform(float *out, const __fp16 *in, int ldin, int y0, int ymax, int k0, int kmax) { - float *outptr = out; - const __fp16 *inptr = in; - - __fp16 zerobuff[8]; - - for (int y=y0; y<ymax; y+=8) { - const __fp16 *inptr0 = inptr + y * ldin + k0; - const __fp16 *inptr1 = inptr0 + ldin; - const __fp16 *inptr2 = inptr1 + ldin; - const __fp16 *inptr3 = inptr2 + ldin; - const __fp16 *inptr4 = inptr3 + ldin; - const __fp16 *inptr5 = inptr4 + ldin; - const __fp16 *inptr6 = inptr5 + ldin; - const __fp16 *inptr7 = inptr6 + ldin; - - prefetch_2x(inptr0); - prefetch_2x(inptr1); - prefetch_2x(inptr2); - prefetch_2x(inptr3); - prefetch_2x(inptr4); - prefetch_2x(inptr5); - prefetch_2x(inptr6); - prefetch_2x(inptr7); - - int x=(kmax-k0); - for (;x>7;x-=8) { - /* Cope with ragged cases by copying from a buffer of zeroes instead */ - if ((y + 7) >= ymax) { - switch ((y + 7) - ymax) { - /* Everything falls through in here */ - case 6: - inptr1 = zerobuff; - case 5: - inptr2 = zerobuff; - case 4: - inptr3 = zerobuff; - case 3: - inptr4 = zerobuff; - case 2: - inptr5 = zerobuff; - case 1: - inptr6 = zerobuff; - case 0: - inptr7 = zerobuff; - default: - break; - } - } - - __asm __volatile ( - // Load up 8 elements (2 vectors) from each of 8 sources. 
- "LDR q0, [%[inptr0]], #16\n" - "LDR q2, [%[inptr1]], #16\n" - "FCVTL2 v1.4s, v0.8h\n" - "FCVTL v0.4s, v0.4h\n" - "LDR q4, [%[inptr2]], #16\n" // q4=C0C1C2C3 - "FCVTL2 v3.4s, v2.8h\n" - "FCVTL v2.4s, v2.4h\n" - "FCVTL2 v5.4s, v4.8h\n" - "FCVTL v4.4s, v4.4h\n" - "ZIP1 v16.4s, v0.4s, v4.4s\n" // q16=A0C0A1C1 - ASM_PREFETCH("[%[inptr0], #128]") - "LDR q6, [%[inptr3]], #16\n" // q6=D0D1D2D3 - "FCVTL2 v7.4s, v6.8h\n" - "FCVTL v6.4s, v6.4h\n" - "ZIP1 v17.4s, v2.4s, v6.4s\n" // q17=B0D0B1D1 - "LDR q8, [%[inptr4]], #16\n" - "LDR q10, [%[inptr5]], #16\n" - "FCVTL2 v9.4s, v8.8h\n" - "FCVTL v8.4s, v8.4h\n" - ASM_PREFETCH("[%[inptr1], #128]") - "LDR q12, [%[inptr6]], #16\n" - "FCVTL2 v11.4s, v10.8h\n" - "FCVTL v10.4s, v10.4h\n" - "FCVTL2 v13.4s, v12.8h\n" - "FCVTL v12.4s, v12.4h\n" - "ZIP1 v18.4s, v8.4s, v12.4s\n" - "LDR q14, [%[inptr7]], #16\n" - "FCVTL2 v15.4s, v14.8h\n" - "FCVTL v14.4s, v14.4h\n" - "ZIP1 v19.4s, v10.4s, v14.4s\n" - - ASM_PREFETCH("[%[inptr2], #128]") - "ZIP1 v20.4s, v16.4s, v17.4s\n" // q20=A0B0C0D0 - "ZIP1 v21.4s, v18.4s, v19.4s\n" - "ZIP2 v22.4s, v16.4s, v17.4s\n" - "ZIP2 v23.4s, v18.4s, v19.4s\n" - ASM_PREFETCH("[%[inptr3], #128]") - - "ZIP2 v16.4s, v0.4s, v4.4s\n" - "ZIP2 v17.4s, v2.4s, v6.4s\n" - "STP q20, q21, [%[outptr]], #32\n" // Write back the first element of each source - - "ZIP2 v18.4s, v8.4s, v12.4s\n" - ASM_PREFETCH("[%[inptr4], #128]") - "ZIP2 v19.4s, v10.4s, v14.4s\n" - "STP q22, q23, [%[outptr]], #32\n" // Write back the second element of each source - - "ZIP1 v20.4s, v16.4s, v17.4s\n" - "ZIP1 v21.4s, v18.4s, v19.4s\n" - ASM_PREFETCH("[%[inptr5], #128]") - "ZIP2 v22.4s, v16.4s, v17.4s\n" - "ZIP2 v23.4s, v18.4s, v19.4s\n" - - "ZIP1 v16.4s, v1.4s, v5.4s\n" - "ZIP1 v17.4s, v3.4s, v7.4s\n" - ASM_PREFETCH("[%[inptr6], #128]") - "STP q20, q21, [%[outptr]], #32\n" // Third element - - "ZIP1 v18.4s, v9.4s, v13.4s\n" - "ZIP1 v19.4s, v11.4s, v15.4s\n" - "STP q22, q23, [%[outptr]], #32\n" // Fourth element - ASM_PREFETCH("[%[inptr7], #128]") - - "ZIP1 v20.4s, v16.4s, v17.4s\n" - "ZIP1 v21.4s, v18.4s, v19.4s\n" - "ZIP2 v22.4s, v16.4s, v17.4s\n" - "ZIP2 v23.4s, v18.4s, v19.4s\n" - - "ZIP2 v16.4s, v1.4s, v5.4s\n" - "ZIP2 v17.4s, v3.4s, v7.4s\n" - "STP q20, q21, [%[outptr]], #32\n" // Fifth element - - "ZIP2 v18.4s, v9.4s, v13.4s\n" - "ZIP2 v19.4s, v11.4s, v15.4s\n" - "STP q22, q23, [%[outptr]], #32\n" // Sixth element - - "ZIP1 v20.4s, v16.4s, v17.4s\n" - "ZIP1 v21.4s, v18.4s, v19.4s\n" - "STP q20, q21, [%[outptr]], #32\n" // Seventh element - - "ZIP2 v22.4s, v16.4s, v17.4s\n" - "ZIP2 v23.4s, v18.4s, v19.4s\n" - "STP q22, q23, [%[outptr]], #32\n" // Eighth element - : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), - [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr) - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", - "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" - ); - } - - for (;x>0;x--) { - *outptr++ = *inptr0++; - *outptr++ = *inptr1++; - *outptr++ = *inptr2++; - *outptr++ = *inptr3++; - *outptr++ = *inptr4++; - *outptr++ = *inptr5++; - *outptr++ = *inptr6++; - *outptr++ = *inptr7++; - } - } -} - -#endif // __aarch64__ diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/a64_transpose_interleave_12way_16bit.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/a64_transpose_interleave_12way_16bit.hpp deleted file mode 100644 index 6e07064a0c..0000000000 --- 
a/arm_compute/core/NEON/kernels/assembly/transforms/a64_transpose_interleave_12way_16bit.hpp +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright (c) 2017-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __aarch64__ - -#include "transpose_interleave_common.hpp" - -// Generic unblocked transposed 6x32-bit sized specialisation -template <> -template <typename T> -inline void TransformImpl<6, 1, true, 4, 4>::Transform( - T* out, const T* const in, const int stride, - const int x0, const int xmax, const int k0, const int kmax -) { - // Redirect to a 12 x uint16_t specialisation - TransformImpl<12, 1, true, 2, 2>::Transform( - reinterpret_cast<uint16_t *>(out), - reinterpret_cast<const uint16_t * const>(in), - stride*2, x0*2, xmax*2, k0, kmax - ); -} - -// Generic 12x16-bit sized specialisation -template <> -template <typename T> -inline void TransformImpl<12, 1, true, 2, 2>::Transform( - T* out, const T* const in, const int stride, - const int x0, const int xmax, const int k0, const int kmax -) { - // Redirect to a uint16_t specialisation - Transform( - reinterpret_cast<uint16_t *>(out), - reinterpret_cast<const uint16_t * const>(in), - stride, x0, xmax, k0, kmax - ); -} - -// Specialised 12 x uint16_t version -template <> -inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out) { - __asm volatile ( - "LDR q0, [%[in0]]\n" - "STR q0, [%[out]]\n" - "LDR d1, [%[in0], #0x10]\n" - "STR d1, [%[out], #0x10]\n" - "ADD %x[in0], %x[in0], #0x18\n" - ASM_PREFETCH("[%[in0], #192]") - : [in0] "+r" (in0), - [out] "+r" (out) - : - : "v0", "v1", "memory" - ); -} - -template <> -inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *out) { - __asm volatile ( - "LDR q0, [%[in0]]\n" - "LDR d1, [%[in0], #0x10]\n" - "ADD %x[in0], %x[in0], #0x18\n" - ASM_PREFETCH("[%[in0], #192]") - - "LDR x21, [%[in1]]\n" - "LDR q2, [%[in1], #0x08]\n" - "INS v1.d[1], x21\n" - "ADD %x[in1], %x[in1], #0x18\n" - "STP q0, q1, [%[out]]\n" - "STR q2, [%x[out], #0x20]\n" - ASM_PREFETCH("[%[in1], #192]") - : [in0] "+r" (in0), - [in1] "+r" (in1), - [out] "+r" (out) - : - : "x21", "v0", "v1", "v2", "memory" - ); -} - -template <> -inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out) 
{ - __asm __volatile ( - "LDR q0, [%x[in0]], #0x10\n" - "STR q0, [%x[out]]\n" - "LDR d1, [%x[in0]], #0x08\n" - ASM_PREFETCH("[%[in0], #192]") - "STR d1, [%x[out], #0x10]\n" - - "LDR q0, [%x[in1]], #0x10\n" - "STR q0, [%x[out], #0x18]\n" - "LDR d1, [%x[in1]], #0x08\n" - ASM_PREFETCH("[%[in1], #192]") - "STR d1, [%x[out], #0x28]\n" - - "LDR q0, [%x[in2]], #0x10\n" - "STR q0, [%x[out], #0x30]\n" - "LDR d1, [%x[in2]], #0x08\n" - ASM_PREFETCH("[%[in2], #192]") - "STR d1, [%x[out], #0x40]\n" - - "LDR q0, [%x[in3]], #0x10\n" - "STR q0, [%x[out], #0x48]\n" - "LDR d1, [%x[in3]], #0x08\n" - ASM_PREFETCH("[%[in3], #192]") - "STR d1, [%x[out], #0x58]\n" - : [in0] "+r" (in0), - [in1] "+r" (in1), - [in2] "+r" (in2), - [in3] "+r" (in3), - [out] "+r" (out) - : - : "v0", "v1", "memory" - ); -} - -template <> -template <> -inline void TransformImpl<12, 1, true, 2, 2>::Transform( - uint16_t* out, const uint16_t* const in, const int stride, - const int x0, const int xmax, const int k0, const int kmax -) { - TransposeInterleaveCommon<12, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax); -} - -#endif // __aarch64__ diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/a64_transpose_interleave_12way_half_to_float.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/a64_transpose_interleave_12way_half_to_float.hpp deleted file mode 100644 index 835e4d87aa..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/transforms/a64_transpose_interleave_12way_half_to_float.hpp +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) 2017-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */
-#pragma once
-
-#if defined( __aarch64__) && defined( __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-
-#include "transpose_interleave_common.hpp"
-
-template <>
-inline void TransposeInterleaveCommon<12, __fp16, float>::moveblock_1x1(const __fp16 *&in0, float *out) {
- __asm __volatile (
- "LDR q0, [%[in0]], #16\n"
- "FCVTL2 v1.4s, v0.8h\n"
- "FCVTL v0.4s, v0.4h\n"
- "STP q0, q1, [%[out]]\n"
- ASM_PREFETCH("[%[in0], #192]")
- "LDR d2, [%[in0]], #8\n"
- "FCVTL v2.4s, v2.4h\n"
- "STR q2, [%[out], #32]\n"
- : [in0] "+r" (in0), [out] "+r" (out)
- :
- : "v0", "v1", "v2", "memory"
- );
-}
-
-template <>
-inline void TransposeInterleaveCommon<12, __fp16, float>::moveblock_1x2(const __fp16 *&in0, const __fp16 *&in1, float *out) {
- __asm __volatile (
- "LDR q0, [%[in0]], #16\n"
- "FCVTL2 v1.4s, v0.8h\n"
- "FCVTL v0.4s, v0.4h\n"
- "STP q0, q1, [%[out]]\n"
- ASM_PREFETCH("[%[in0], #192]")
- "LDR d2, [%[in0]], #8\n"
- "FCVTL v2.4s, v2.4h\n"
- "LDR q3, [%[in1]], #16\n"
- "FCVTL2 v4.4s, v3.8h\n"
- "FCVTL v3.4s, v3.4h\n"
- "STP q2, q3, [%[out], #32]\n"
- ASM_PREFETCH("[%[in1], #192]")
- "LDR d5, [%[in1]], #8\n"
- "FCVTL v5.4s, v5.4h\n"
- "STP q4, q5, [%[out], #64]\n"
- : [in0] "+r" (in0), [in1] "+r" (in1), [out] "+r" (out)
- :
- : "v0", "v1", "v2", "v3", "v4", "v5", "memory"
- );
-}
-
-template <>
-inline void TransposeInterleaveCommon<12, __fp16, float>::moveblock_1x4(const __fp16 *&in0, const __fp16 *&in1, const __fp16 *&in2, const __fp16 *&in3, float *out) {
- __asm __volatile (
- "LDR q0, [%[in0]], #16\n"
- "FCVTL2 v1.4s, v0.8h\n"
- "FCVTL v0.4s, v0.4h\n"
- "STP q0, q1, [%[out]]\n"
- "LDR d2, [%[in0]], #8\n"
- ASM_PREFETCH("[%[in0], #192]")
- "FCVTL v2.4s, v2.4h\n"
- "LDR q3, [%[in1]], #16\n"
- "FCVTL2 v4.4s, v3.8h\n"
- "FCVTL v3.4s, v3.4h\n"
- "STP q2, q3, [%[out], #32]\n"
- "LDR d5, [%[in1]], #8\n"
- "FCVTL v5.4s, v5.4h\n"
- ASM_PREFETCH("[%[in1], #192]")
- "STP q4, q5, [%[out], #64]\n"
- "LDR q6, [%[in2]], #16\n"
- "FCVTL2 v7.4s, v6.8h\n"
- "FCVTL v6.4s, v6.4h\n"
- "STP q6, q7, [%[out], #96]\n"
- "LDR d8, [%[in2]], #8\n"
- "FCVTL v8.4s, v8.4h\n"
- ASM_PREFETCH("[%[in2], #192]")
- "LDR q9, [%[in3]], #16\n"
- "FCVTL2 v10.4s, v9.8h\n"
- "FCVTL v9.4s, v9.4h\n"
- "STP q8, q9, [%[out], #128]\n"
- "LDR d11, [%[in3]], #8\n"
- "FCVTL v11.4s, v11.4h\n"
- "STP q10, q11, [%[out], #160]\n"
- ASM_PREFETCH("[%[in3], #192]")
-
- : [in0] "+r" (in0), [in1] "+r" (in1), [in2] "+r" (in2), [in3] "+r" (in3), [out] "+r" (out)
- :
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "memory"
- );
-}
-
-template <>
-template <>
-inline void TransformImpl<12, 1, true, 4, 2>::Transform(
- float* out, const __fp16* const in, const int stride,
- const int x0, const int xmax, const int k0, const int kmax
-) {
- TransposeInterleaveCommon<12, __fp16, float>::Transform(out, in, stride, x0, xmax, k0, kmax);
-}
-
-#endif // __aarch64__
diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/a64_transpose_interleave_24way_16bit.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/a64_transpose_interleave_24way_16bit.hpp
deleted file mode 100644
index b6565baa23..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/transforms/a64_transpose_interleave_24way_16bit.hpp
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __aarch64__ - -#include "transpose_interleave_common.hpp" - -// Generic unblocked transposed 12x32-bit sized specialisation -template <> -template <typename T> -inline void TransformImpl<12, 1, true, 4, 4>::Transform( - T* out, const T* const in, const int stride, - const int x0, const int xmax, const int k0, const int kmax -) { - // Redirect to a 24 x uint16_t specialisation - TransformImpl<24, 1, true, 2, 2>::Transform( - reinterpret_cast<uint16_t *>(out), - reinterpret_cast<const uint16_t * const>(in), - stride*2, x0*2, xmax*2, k0, kmax - ); -} - -// Generic 24x16-bit sized specialisation -template <> -template <typename T> -inline void TransformImpl<24, 1, true, 2, 2>::Transform( - T* out, const T* const in, const int stride, - const int x0, const int xmax, const int k0, const int kmax -) { - // Redirect to a uint16_t specialisation - Transform( - reinterpret_cast<uint16_t *>(out), - reinterpret_cast<const uint16_t * const>(in), - stride, x0, xmax, k0, kmax - ); -} - -// Specialised 24 x uint16_t version -template <> -inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out) { - __asm __volatile ( - "LDP q0, q1, [%[in0]], #32\n" - "STP q0, q1, [%[out]]\n" - ASM_PREFETCH("[%[in0], #192]") - "LDR q2, [%[in0]], #16\n" - "STR q2, [%[out], #32]\n" - : [in0] "+r" (in0), [out] "+r" (out) - : - : "v0", "v1", "v2", "memory" - ); -} - -template <> -inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1,uint16_t *out) { - __asm __volatile ( - "LDP q0, q1, [%[in0]], #32\n" - "STP q0, q1, [%[out]]\n" - ASM_PREFETCH("[%[in0], #192]") - "LDR q2, [%[in0]], #16\n" - "LDP q3, q4, [%[in1]], #32\n" - "STP q2, q3, [%[out], #32]\n" - ASM_PREFETCH("[%[in1], #192]") - "LDR q5, [%[in1]], #16\n" - "STP q4, q5, [%[out], #64]\n" - : [in0] "+r" (in0), [in1] "+r" (in1), [out] "+r" (out) - : - : "v0", "v1", "v2", "v3", "v4", "v5", "memory" - ); -} - -template <> -inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out) { - __asm __volatile ( - "LDP q0, q1, [%[in0]], #32\n" - "STP q0, q1, [%[out]]\n" - "LDR q2, [%[in0]], #16\n" - ASM_PREFETCH("[%[in0], #192]") - "LDP q3, q4, [%[in1]], #32\n" - "STP q2, q3, [%[out], #32]\n" - "LDR q5, 
[%[in1]], #16\n" - ASM_PREFETCH("[%[in1], #192]") - "STP q4, q5, [%[out], #64]\n" - "LDP q6, q7, [%[in2]], #32\n" - "STP q6, q7, [%[out], #96]\n" - "LDR q8, [%[in2]], #16\n" - ASM_PREFETCH("[%[in2], #192]") - "LDP q9, q10, [%[in3]], #32\n" - "STP q8, q9, [%[out], #128]\n" - "LDR q11, [%[in3]], #16\n" - "STP q10, q11, [%[out], #160]\n" - ASM_PREFETCH("[%[in3], #192]") - - : [in0] "+r" (in0), [in1] "+r" (in1), [in2] "+r" (in2), [in3] "+r" (in3), [out] "+r" (out) - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "memory" - ); -} - -template <> -template <> -inline void TransformImpl<24, 1, true, 2, 2>::Transform( - uint16_t* out, const uint16_t* const in, const int stride, - const int x0, const int xmax, const int k0, const int kmax -) { - TransposeInterleaveCommon<24, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax); -} - -#endif // __arch64__ diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/transpose_interleave_common.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/transpose_interleave_common.hpp deleted file mode 100644 index 231b3f181e..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/transforms/transpose_interleave_common.hpp +++ /dev/null @@ -1,139 +0,0 @@ -/* - * Copyright (c) 2017-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */
-#pragma once
-
-template <unsigned int IntBy, typename TIn, typename TOut>
-struct TransposeInterleaveCommon {
- // Override the moveblock_1xY methods to improve performance
- static inline void moveblock_1x1(const TIn *&in0, TOut *out) {
- for (unsigned int i = 0; i < IntBy; i++) {
- *out++ = static_cast<TOut>(*in0++);
- }
- }
-
- static inline void moveblock_1x2(const TIn *&in0, const TIn *&in1, TOut *out) {
- for (unsigned int i = 0; i < IntBy; i++) {
- *out++ = static_cast<TOut>(*in0++);
- }
- for (unsigned int i = 0; i < IntBy; i++) {
- *out++ = static_cast<TOut>(*in1++);
- }
- }
-
- static inline void moveblock_1x4(const TIn *&in0, const TIn *&in1, const TIn *&in2, const TIn *&in3, TOut *out) {
- for (unsigned int i = 0; i < IntBy; i++) {
- *out++ = static_cast<TOut>(*in0++);
- }
- for (unsigned int i = 0; i < IntBy; i++) {
- *out++ = static_cast<TOut>(*in1++);
- }
- for (unsigned int i = 0; i < IntBy; i++) {
- *out++ = static_cast<TOut>(*in2++);
- }
- for (unsigned int i = 0; i < IntBy; i++) {
- *out++ = static_cast<TOut>(*in3++);
- }
- }
-
- static inline void Transform(TOut *out, const TIn *in, const int stride, const int x0, const int xmax, const int k0, const int kmax) {
- const auto ldin = stride;
-
- TOut *outarray = out;
- const TIn *inarray = in;
- TOut *outptr_base = outarray;
- const TIn *inptr_base = inarray + x0 + (k0 * ldin);
- int ldout = (kmax - k0) * IntBy;
-
- int k=(kmax-k0);
- for ( ; k>3; k-=4) {
- TOut *outptr = outptr_base;
- const TIn *inptr = inptr_base;
- const TIn *inptr1 = inptr + ldin;
- const TIn *inptr2 = inptr1 + ldin;
- const TIn *inptr3 = inptr2 + ldin;
-
- prefetch_3x(inptr);
- prefetch_3x(inptr1);
- prefetch_3x(inptr2);
- prefetch_3x(inptr3);
-
- outptr_base += IntBy * 4;
- inptr_base += ldin * 4;
-
- for (int x = (xmax-x0) / IntBy; x > 0 ; x--) {
- moveblock_1x4(inptr, inptr1, inptr2, inptr3, outptr);
- outptr += ldout;
- }
- }
-
- if (k) {
- TOut *outptr = outptr_base;
- const TIn *inptr = inptr_base;
- const TIn *inptr1 = inptr + ldin;
- const TIn *inptr2 = inptr1 + ldin;
-
- prefetch_3x(inptr);
- prefetch_3x(inptr1);
- prefetch_3x(inptr2);
-
- for (int x = (xmax-x0) / IntBy; x > 0 ; x--) {
- switch(k) {
- case 3:
- moveblock_1x2(inptr, inptr1, outptr);
- moveblock_1x1(inptr2, outptr + IntBy * 2);
- break;
-
- case 2:
- moveblock_1x2(inptr, inptr1, outptr);
- break;
-
- case 1:
- moveblock_1x1(inptr, outptr);
- break;
- default:
- break;
- }
-
- outptr += ldout;
- }
- }
-
- // Cope with ragged X cases
- const unsigned int overflow = (xmax - x0) % IntBy;
- if (overflow) {
- const TIn *inptr_base = inarray + (xmax - overflow) + (k0 * ldin);
- TOut *outptr = outarray + ((xmax - x0) / IntBy) * ldout;
-
- for (int k=(kmax-k0); k>0; k--) {
- const TIn *inptr = inptr_base;
- inptr_base += ldin;
-
- for (unsigned int x=0; x < IntBy; x++) {
- TOut val = (x < overflow) ? static_cast<TOut>(*inptr++) : static_cast<TOut>(0);
- *outptr++ = val;
- }
- }
- }
-}
-};
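Editor's note on the removed interleave transforms: every a32/a64 "interleave" file deleted above implements the same packing contract. IntBy rows of the source are walked in lockstep and written out element-interleaved, with rows past ymax redirected to a zero buffer (the fall-through switch blocks labelled "Cope with ragged cases"). The following is a minimal scalar sketch of that contract, not library code; the function name is illustrative and the unblocked (block size 1) case is assumed, whereas a64_block16_interleave4_8bit copies 16-element runs per row instead of single elements.

    #include <cstdint>

    // Scalar model of the deleted unblocked interleave transforms: pack IntBy
    // rows at a time, element-interleaved, zero-padding rows beyond ymax.
    template <unsigned int IntBy, typename T>
    void interleave_ref(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) {
        for (int y = y0; y < ymax; y += IntBy) {
            for (int k = k0; k < kmax; k++) {
                for (unsigned int r = 0; r < IntBy; r++) {
                    const int row = y + static_cast<int>(r);
                    // Rows past ymax correspond to the zerobuff redirection above.
                    *out++ = (row < ymax) ? in[row * ldin + k] : static_cast<T>(0);
                }
            }
        }
    }

The NEON versions above are this loop nest with the inner two levels unrolled into vector loads, ZIP/VZIP interleaves and stores.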
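The transpose-interleave files all funnel into TransposeInterleaveCommon<IntBy, TIn, TOut>::Transform, shown in full above: each strip of IntBy consecutive columns produces one contiguous output block of (kmax - k0) * IntBy elements, converted from TIn to TOut, with a ragged final strip padded with zeroes. A scalar restatement of that contract (illustrative name, not the library API):

    // Scalar restatement of TransposeInterleaveCommon<IntBy, TIn, TOut>::Transform.
    template <unsigned int IntBy, typename TIn, typename TOut>
    void transpose_interleave_ref(TOut *out, const TIn *in, int stride,
                                  int x0, int xmax, int k0, int kmax) {
        const int ldout = (kmax - k0) * static_cast<int>(IntBy);
        for (int x = x0; x < xmax; x += IntBy) {
            TOut *outptr = out + ((x - x0) / static_cast<int>(IntBy)) * ldout;
            for (int k = k0; k < kmax; k++) {
                for (unsigned int i = 0; i < IntBy; i++) {
                    const int col = x + static_cast<int>(i);
                    // "Cope with ragged X cases": zero-pad the last strip.
                    *outptr++ = (col < xmax) ? static_cast<TOut>(in[k * stride + col])
                                             : static_cast<TOut>(0);
                }
            }
        }
    }

The specialised moveblock_1xN overrides above are this inner loop unrolled into vector loads and stores (plus FCVTL/FCVTL2 widening for the __fp16-to-float variants).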
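Several deleted 32-bit transposed specialisations (the a32 8-way above, and the a64 6-way and 12-way) are pure redirects: because a transposed copy never splits an element, an N-wide 32-bit transform is byte-for-byte identical to a 2N-wide 16-bit one on doubled stride and x bounds. Expressed against the sketch above (again an illustrative helper, not the library's API):

    #include <cstdint>

    // Width-doubling redirect: view each 32-bit element as two 16-bit elements.
    // Valid because the transform only moves bytes; it never mixes element halves.
    inline void transpose_interleave_6x32_ref(uint32_t *out, const uint32_t *in, int stride,
                                              int x0, int xmax, int k0, int kmax) {
        transpose_interleave_ref<12, uint16_t, uint16_t>(
            reinterpret_cast<uint16_t *>(out),
            reinterpret_cast<const uint16_t *>(in),
            stride * 2, x0 * 2, xmax * 2, k0, kmax);
    }

This mirrors TransformImpl<6, 1, true, 4, 4> forwarding to TransformImpl<12, 1, true, 2, 2> in a64_transpose_interleave_12way_16bit.hpp. (The __fp16-to-float variants cannot use this trick, since they convert elements rather than merely move them.)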