aboutsummaryrefslogtreecommitdiff
path: root/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
diff options
context:
space:
mode:
authorGunes Bayir <gunes.bayir@arm.com>2024-02-12 21:32:51 +0000
committerGunes Bayir <gunes.bayir@arm.com>2024-02-21 10:36:22 +0000
commitef637398a8c2060e15de438020c53331da8bd6dd (patch)
treeb1a1738736c9b6b49e76767e44bf4b77bf732876 /src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
parent0a48c4c83b598991b4d4235f870c24d9e6634b20 (diff)
downloadComputeLibrary-ef637398a8c2060e15de438020c53331da8bd6dd.tar.gz
Integrate new pretranspose_b_array with extra fused transpose of B
This patch fuses the transposition taking place in Acl with the transformations done in arm_gemm (called pretranspose_b_array) if the underlying kernel and transform support it. This should improve start-up time (as it's for constant Rhs matrices) and memory footprint. The transformations in arm_gemm are kernel specific. The Rhs matrix is transformed into certain layouts to improve the performance.

Resolves: COMPMID-6595
Change-Id: Id2932dd966e59f903c279417bebcea83d9a42464
Signed-off-by: Gunes Bayir <gunes.bayir@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/11144
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp')
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp20
1 file changed, 14 insertions, 6 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
index 362a3e30ea..4f732f7d94 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2023 Arm Limited.
+ * Copyright (c) 2017-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -1067,11 +1067,18 @@ public:
}
}
- void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
- pretranspose_B_array_part(in_buffer, B, ldb, B_multi_stride, 0, get_B_pretranspose_window_size());
+ // Support for transposed B is a property of the strategy::transpose type
+ bool B_pretranspose_supports_transpose() const override {
+ typename transform_type<strategy, MergeStep && std::is_same<OutputStage, Requantize32>::value>::type transforms;
+
+ return transforms.PrepareB_supports_transpose();
+ }
+
+ void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride, const bool transposed) override {
+ pretranspose_B_array_part(in_buffer, B, ldb, B_multi_stride, transposed, 0, get_B_pretranspose_window_size());
}
- void pretranspose_B_array_part(void *in_buffer, const To *B, const int ldb, const int B_multi_stride, size_t start, size_t end) override {
+ void pretranspose_B_array_part(void *in_buffer, const To *B, const int ldb, const int B_multi_stride, const bool transposed, size_t start, size_t end) override {
// Perform column sums etc as part of the last block.
if (end >= get_B_pretranspose_window_size()) {
requantize_bias(in_buffer, B, ldb, B_multi_stride);
@@ -1134,7 +1141,8 @@ public:
strat.transforms.PrepareB(buffer, B + (current.multi() * B_multi_stride), ldb,
x0, xmax,
(k_section_base * _Ksize) + k_offset, // K starting point - compute row to read based on our section and the true section length.
- (k_section_base * _Ksize) + k_offset + k_length); // K end point - starting point plus length computed above.
+ (k_section_base * _Ksize) + k_offset + k_length, // K end point - starting point plus length computed above.
+ transposed);
// We need to modify our position based on the ROUNDED version of what we just did.
unsigned int padded_length = roundup(k_length, strategy::k_unroll());
@@ -1149,7 +1157,7 @@ public:
// In the single K section case, can process the whole lot in one go.
// Caution: 'blockwalker::kmax()' rounds up, so clamp to valid _Ksize.
strat.transforms.PrepareB(buffer, B + (current.multi() * B_multi_stride), ldb,
- current.x0(), current.xmax(), current.k0(), std::min(current.kmax(), _Ksize));
+ current.x0(), current.xmax(), current.k0(), std::min(current.kmax(), _Ksize), transposed);
buffer += roundup(current.xmax() - current.x0(), strategy::out_width()) * roundup(current.kmax() - current.k0(), strategy::k_unroll());
}