From 33d22072b2c30244d3e10913d78691019dc937f4 Mon Sep 17 00:00:00 2001 From: Pablo Tello Date: Thu, 25 Jan 2018 10:48:46 +0000 Subject: COMPMID-860: Neon HGEMM integrated assembly kernel from RSH for Arm Cortex-A55r1. Change-Id: I640ae54dcc4591915c7a539b27728f05b70cf0eb Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/117616 Reviewed-by: Georgios Pinitas Tested-by: Jenkins --- .../kernels/assembly/kernels/a64_hgemm_24x8.hpp | 5 + .../assembly/kernels/a64_hgemm_24x8/a55r1.hpp | 384 +++++++++++++++++++++ 2 files changed, 389 insertions(+) create mode 100644 arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8/a55r1.hpp (limited to 'arm_compute/core') diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8.hpp index ba6d2989c9..5e7684f692 100644 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8.hpp +++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8.hpp @@ -28,6 +28,7 @@ // Get the components we need to implement SGEMM. // Can select appropriate components dependent on AArch32 vs. AArch64 etc. at build time. #include "a64_hgemm_24x8/generic.hpp" +#include "a64_hgemm_24x8/a55r1.hpp" // 24x8 HGEMM "strategy" class. Describes the kernel properties. // @@ -56,7 +57,11 @@ public: hgemm_24x8(const struct CPUInfo *ci) { kernel = a64_hgemm_asimd_24x8; + if (ci->CPU == CPUTarget::A55_DOT) { + kernel = a64_hgemm_asimd_24x8_a55r1; + } } + }; #endif // __aarch64__ and FP16_VECTOR_ARITHMETIC diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8/a55r1.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8/a55r1.hpp new file mode 100644 index 0000000000..1789abb046 --- /dev/null +++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8/a55r1.hpp @@ -0,0 +1,384 @@ +/* + * Copyright (c) 2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#include <arm_neon.h> + +// Kernel implementation. +// +// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order. +// Assume that "Bpanel" points to a chunk of B blocks (each size 12xK) in read-order. +// Assume that "Cpanel" points to a chunk of C output blocks (each size +// 12x8), the chunks being arranged in a row major fashion. +// +// Note that the intent of this is that either ablocks or bblocks will be 1 +// - this construction allows the output loop to proceed in either order. + +inline void a64_hgemm_asimd_24x8_a55r1(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) { + const __fp16 *a_ptr = Apanel; + __fp16 *c_ptr = Cpanel; + + // Fix up for odd lengths - set a flag if K is odd, but make + // sure we round up the iteration count. 
+ int oddk = (K & 1); + int k_iters = ((K+1)/2) - 1; + + for (int yb=0; yb