From 4ee8b1599dbaf7634d25607fa5ac96ba3dc6b0f2 Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Fri, 16 Jul 2021 16:16:43 +0100
Subject: Update GEMM assembly kernels

- Introduce Fp32 kernels with internal calculations in Bfloat16 when
  fast_mode is enabled
- Improve kernel selection heuristics

Signed-off-by: Georgios Pinitas
Change-Id: I68a9e7e862b6fd2721b46e0d7cc791091c4ab279
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5965
Tested-by: Arm Jenkins
Comments-Addressed: Arm Jenkins
---
 .../a64_interleaved_bf16fp32_mmla_8x12/generic.cpp | 647 ++++++++-------------
 1 file changed, 255 insertions(+), 392 deletions(-)

(limited to 'src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12')

diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp
index c476fcf171..94c72a31c9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,406 +23,269 @@
  */
 #ifdef __aarch64__
 
+#include <cstddef>
 #include "../../bfloat.hpp"
-#include "../../asmlib.hpp"
 
 namespace arm_gemm {
 
-void a64_interleaved_bf16fp32_mmla_8x12(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
-    const bfloat16 *a_ptr = Apanel;
-    float *c_ptr = Cpanel;
+void a64_interleaved_bf16fp32_mmla_8x12(
+    const bfloat16 *Apanel, const bfloat16 *Bpanel,
+    float *Cpanel, int ablocks, int bblocks, int K) {
 
-    K /= 4;
-    const long loops_count = (K / 2) - 1;
-    const long tails_count = K % 2;
+    struct KernelArgs {
+        size_t bblocks = {};
+        size_t K = {};
+        const bfloat16 *Bpanel = {};
+    } ka;
 
-    for (int yb=0; yb