about | summary | refs | log | tree | commit | diff
path: root/src/core/NEON/kernels/arm_gemm/interleave_indirect_impl.hpp
diff options
context:
space:
mode:
author Michael Tyler <michael.tyler@arm.com> 2023-04-12 17:43:17 +0100
committer michael.tyler <michael.tyler@arm.com> 2023-06-05 15:57:58 +0000
commit 74921eee924625426429044decefe3673561b174 (patch)
tree 654da1a95e3d42d6af8ad1ff27bb40d77b1fd8c5 /src/core/NEON/kernels/arm_gemm/interleave_indirect_impl.hpp
parent df5d9878008be9b60586df97ebfff197abb5195e (diff)
download ComputeLibrary-74921eee924625426429044decefe3673561b174.tar.gz
Update CPU kernel implementations and guard directives
Resolves COMPMID-6023

Change-Id: I868975d14c4f98af6716726feda22405a6a4c891
Signed-off-by: Michael Tyler <michael.tyler@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9686
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/interleave_indirect_impl.hpp')
-rw-r--r-- src/core/NEON/kernels/arm_gemm/interleave_indirect_impl.hpp 22
1 files changed, 21 insertions, 1 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/interleave_indirect_impl.hpp b/src/core/NEON/kernels/arm_gemm/interleave_indirect_impl.hpp
index 4f25da2877..b921fd16d2 100644
--- a/src/core/NEON/kernels/arm_gemm/interleave_indirect_impl.hpp
+++ b/src/core/NEON/kernels/arm_gemm/interleave_indirect_impl.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -39,8 +39,12 @@
*/
template<unsigned int height_vectors, unsigned int block, VLType vlt, bool integrate_sums, typename TIn, typename TOut>
void interleave_block( TOut * &out, const TIn * const *in, size_t width, size_t height, size_t row_offset, bool first) {
+#ifdef ARM_COMPUTE_ENABLE_SVE
const unsigned int int_by = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block :
(vlt == VLType::SME ? sme::get_vector_length<TOut>() / block : 1 ));
+#else
+ const unsigned int int_by = height_vectors;
+#endif
std::vector<int32_t> the_sums;
@@ -104,8 +108,12 @@ void interleave_block( TOut * &out, const TIn * const *in, size_t width, size_t
template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TOut>
inline void FixupRowSums(TOut * &out, const int32_t row_sum_multiplier) {
+#ifdef ARM_COMPUTE_ENABLE_SVE
const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block :
(vlt == VLType::SME ? sme::get_vector_length<TOut>() / block : 1 ));
+#else
+ const unsigned int height = height_vectors;
+#endif
// If we are integrating row sums, we need to do some fix up, depending on whether the multiplier is non-zero or not.
if (row_sum_multiplier) {
@@ -138,8 +146,12 @@ void IndirectInterleave(TOut *out, const TIn * const * const *ptr, unsigned int
unsigned int rounded_stringlen, const unsigned int y0, const unsigned int ymax,
const unsigned int k0, const unsigned int kmax, bool integrate_sums,
const int32_t row_sum_multiplier) {
+#ifdef ARM_COMPUTE_ENABLE_SVE
const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block :
(vlt == VLType::SME ? sme::get_vector_length<TOut>() / block : 1 ));
+#else
+ const unsigned int height = height_vectors;
+#endif
// 'interleave_block' implementations are entitled to read a pointer for each row they handle from the input
// pointer array, even for out of range rows (although they must not subsequently dereference those pointers for
@@ -208,8 +220,12 @@ void IndirectInterleave(TOut *out, const TIn * const * const *ptr, unsigned int
template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn, typename TOut>
void ConvolutionInterleave(TOut *out, const TIn *in, size_t in_stride, const convolver<TIn> &conv, const unsigned int rounded_stringlen,
const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier) {
+#ifdef ARM_COMPUTE_ENABLE_SVE
const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block :
(vlt == VLType::SME ? sme::get_vector_length<TOut>() / block : 1 ));
+#else
+ const unsigned int height = height_vectors;
+#endif
auto conv_cols = conv.process_columns(in, in_stride, k0, kmax, rounded_stringlen);
// Use alloca here as a std::vector can be expensive in highly threaded scenarios.
@@ -246,8 +262,12 @@ void ConvolutionInterleave(TOut *out, const TIn *in, size_t in_stride, const con
template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn, typename TOut>
void Interleave(TOut *out, const TIn *in, size_t in_stride, const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier) {
+#ifdef ARM_COMPUTE_ENABLE_SVE
const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block :
(vlt == VLType::SME ? sme::get_vector_length<TOut>() / block : 1 ));
+#else
+ const unsigned int height = height_vectors;
+#endif
// Use alloca here as a std::vector can be expensive in highly threaded scenarios.
const TIn **row_ptrs = reinterpret_cast<const TIn **>(alloca(height * sizeof(const TIn *)));