diff options
author | Michalis Spyrou <michalis.spyrou@arm.com> | 2021-04-20 12:15:52 +0100 |
---|---|---|
committer | Michalis Spyrou <michalis.spyrou@arm.com> | 2021-04-20 17:10:48 +0000 |
commit | 778b95cb755880ab4adc972fbf3b7022a99b63f9 (patch) | |
tree | 13a4f6060e20a0989c558d7750386b8f32827e4b /src/core/NEON/kernels/arm_gemm/mergeresults.cpp | |
parent | dcf4c87cf78a5f1667699c1a3511d09356938660 (diff) | |
download | ComputeLibrary-778b95cb755880ab4adc972fbf3b7022a99b63f9.tar.gz |
Update assembly code
This patch brings performance uplift on Cortex-A35.
Resolves: COMPMID-4316
Change-Id: I2b9c02a599373f780dd1b981b821e33bd59a3422
Signed-off-by: Michalis Spyrou <michalis.spyrou@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5461
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/mergeresults.cpp')
-rw-r--r-- | src/core/NEON/kernels/arm_gemm/mergeresults.cpp | 9 |
1 file changed, 7 insertions, 2 deletions
```diff
diff --git a/src/core/NEON/kernels/arm_gemm/mergeresults.cpp b/src/core/NEON/kernels/arm_gemm/mergeresults.cpp
index 8ca947a6dc..adcdc88fd9 100644
--- a/src/core/NEON/kernels/arm_gemm/mergeresults.cpp
+++ b/src/core/NEON/kernels/arm_gemm/mergeresults.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 Arm Limited.
+ * Copyright (c) 2017-2018, 2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,7 +25,6 @@
 /* As some of the merges need these headers, but are all included in the
  * arm_gemm namespace, put these headers here. */
 #include <algorithm>
-#include <limits>
 
 #include <arm_neon.h>
 
@@ -97,6 +96,12 @@ void MergeResults(Tout * out, const Tin * in, int ldc, int y0, int ymax, int x0,
 
 #include "merges/list.hpp"
 
+/* Cortex-A53 8x6 SGEMM kernel uses a templated merge as the optimized merge
+ * generator cannot cope with the width (6) not being a multiple of VL (4). */
+#ifdef __aarch64__
+template void MergeResults<6u, 8u, false, float, float>(float *, float const*, int, int, int, int, int, float const *, Activation, bool);
+#endif
+
 #if defined(__aarch64__) && defined(__ARM_FP16_ARGS)
 template void MergeResults<12u, 8u, false, float, __fp16>(__fp16*, float const*, int, int, int, int, int, __fp16 const*, Activation, bool);
 #endif
```