aboutsummaryrefslogtreecommitdiff
path: root/src/core/NEON/kernels/arm_gemm/mergeresults.cpp
diff options
context:
space:
mode:
author: Michalis Spyrou <michalis.spyrou@arm.com> 2021-04-20 12:15:52 +0100
committer: Michalis Spyrou <michalis.spyrou@arm.com> 2021-04-20 17:10:48 +0000
commit: 778b95cb755880ab4adc972fbf3b7022a99b63f9 (patch)
tree: 13a4f6060e20a0989c558d7750386b8f32827e4b /src/core/NEON/kernels/arm_gemm/mergeresults.cpp
parent: dcf4c87cf78a5f1667699c1a3511d09356938660 (diff)
download: ComputeLibrary-778b95cb755880ab4adc972fbf3b7022a99b63f9.tar.gz
Update assembly code
This patch brings performance uplift on Cortex-A35.

Resolves: COMPMID-4316
Change-Id: I2b9c02a599373f780dd1b981b821e33bd59a3422
Signed-off-by: Michalis Spyrou <michalis.spyrou@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5461
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/mergeresults.cpp')
-rw-r--r-- src/core/NEON/kernels/arm_gemm/mergeresults.cpp 9
1 files changed, 7 insertions, 2 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/mergeresults.cpp b/src/core/NEON/kernels/arm_gemm/mergeresults.cpp
index 8ca947a6dc..adcdc88fd9 100644
--- a/src/core/NEON/kernels/arm_gemm/mergeresults.cpp
+++ b/src/core/NEON/kernels/arm_gemm/mergeresults.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 Arm Limited.
+ * Copyright (c) 2017-2018, 2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,6 @@
/* As some of the merges need these headers, but are all included in the
* arm_gemm namespace, put these headers here. */
#include <algorithm>
-#include <limits>
#include <arm_neon.h>
@@ -97,6 +96,12 @@ void MergeResults(Tout * out, const Tin * in, int ldc, int y0, int ymax, int x0,
#include "merges/list.hpp"
+/* Cortex-A53 8x6 SGEMM kernel uses a templated merge as the optimized merge
+ * generator cannot cope with the width (6) not being a multiple of VL (4). */
+#ifdef __aarch64__
+template void MergeResults<6u, 8u, false, float, float>(float *, float const*, int, int, int, int, int, float const *, Activation, bool);
+#endif
+
#if defined(__aarch64__) && defined(__ARM_FP16_ARGS)
template void MergeResults<12u, 8u, false, float, __fp16>(__fp16*, float const*, int, int, int, int, int, __fp16 const*, Activation, bool);
#endif