diff options
author | Gunes Bayir <gunes.bayir@arm.com> | 2024-02-12 21:32:51 +0000 |
---|---|---|
committer | Gunes Bayir <gunes.bayir@arm.com> | 2024-02-21 10:36:22 +0000 |
commit | ef637398a8c2060e15de438020c53331da8bd6dd (patch) | |
tree | b1a1738736c9b6b49e76767e44bf4b77bf732876 /src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp | |
parent | 0a48c4c83b598991b4d4235f870c24d9e6634b20 (diff) | |
download | ComputeLibrary-ef637398a8c2060e15de438020c53331da8bd6dd.tar.gz |
Integrate new pretranspose_b_array with extra fused transpose of B
This patch fuses the transposition taking place in Acl with the transformations done in arm_gemm (called pretranspose_b_array) if the underlying kernel and transform support it. This should improve start-up time (as it's for constant Rhs matrices) and memory footprint. The transformations in arm_gemm are kernel specific. The Rhs matrix is transformed into certain layouts to improve the performance.
Resolves: COMPMID-6595
Change-Id: Id2932dd966e59f903c279417bebcea83d9a42464
Signed-off-by: Gunes Bayir <gunes.bayir@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/11144
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp')
-rw-r--r-- | src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp | 18 |
1 file changed, 12 insertions, 6 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp index 1780375c44..89c2d5a23e 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2023 Arm Limited. + * Copyright (c) 2017-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -631,11 +631,16 @@ public: } } - void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override { - pretranspose_B_array_part(in_buffer, B, ldb, B_multi_stride, 0, get_B_pretranspose_window_size()); + bool B_pretranspose_supports_transpose() const override { + strategy strat(_args._ci); + return strat.transforms.PrepareB_supports_transpose(); + } + + void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride, bool transposed) override { + pretranspose_B_array_part(in_buffer, B, ldb, B_multi_stride, transposed, 0, get_B_pretranspose_window_size()); } - void pretranspose_B_array_part(void *in_buffer, const To *B, const int ldb, const int B_multi_stride, size_t start, size_t end) override { + void pretranspose_B_array_part(void *in_buffer, const To *B, const int ldb, const int B_multi_stride, bool transposed, size_t start, size_t end) override { if (end >= get_B_pretranspose_window_size()) { requantize_bias(in_buffer, B, ldb, B_multi_stride); } @@ -717,7 +722,8 @@ public: strat.transforms.PrepareB(buffer, B + (multi * B_multi_stride), ldb, x0, xmax, (k_section_base * _args._Ksize) + k_offset, // K starting point - compute row to read based on our section and the true section length. - (k_section_base * _args._Ksize) + k_offset + k_length); // K end point - starting point plus length computed above. + (k_section_base * _args._Ksize) + k_offset + k_length, // K end point - starting point plus length computed above. 
+ transposed); // We need to modify our position based on the ROUNDED version of what we just did. unsigned int padded_length = roundup(k_length, strategy::k_unroll()); @@ -731,7 +737,7 @@ public: } else { // In the single K section case, can process the whole lot in one go. strat.transforms.PrepareB(buffer, B + (multi * B_multi_stride), ldb, - n_start, n_end, k0, std::min(kmax, _args._Ksize)); + n_start, n_end, k0, std::min(kmax, _args._Ksize), transposed); } } } |