From dba672cec878966e465bb476e896c8f75bbd9145 Mon Sep 17 00:00:00 2001
From: SiCong Li
Date: Thu, 6 Apr 2023 16:30:18 +0100
Subject: Integrate multi-threaded pretranspose_B_array

This is required for the case where rhs (B) is dynamic and needs to be
pretransposed in every run. In a multi-threaded setting, this means the
previously single-threaded pretranspose_B_array would become the
bottleneck.

Resolves COMPMID-5896
Signed-off-by: SiCong Li
Change-Id: Id508c46992188a0f76a505152931d4955d04c16d
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9455
Tested-by: Arm Jenkins
Reviewed-by: Viet-Hoa Do
Reviewed-by: Jakub Sujak
Comments-Addressed: Arm Jenkins
Benchmark: Arm Jenkins
---
 .../NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp | 58 ++++++++++++++++++----
 1 file changed, 49 insertions(+), 9 deletions(-)

(limited to 'src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp')

diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp
index 90e2f07607..0bbcd10b66 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2022 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -614,6 +614,10 @@ public:
         return size;
     }
 
+    size_t get_B_pretranspose_window_size() const override {
+        return _args._nmulti * iceildiv(_args._Nsize, strategy::out_width());
+    }
+
     void requantize_bias(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
         if (std::is_same<OutputStage, Requantize32>::value) {
             _col_bias = reinterpret_cast<int32_t *>(in_buffer);
@@ -628,25 +632,62 @@ public:
     }
 
     void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
-        requantize_bias(in_buffer, B, ldb, B_multi_stride);
+        pretranspose_B_array_part(in_buffer, B, ldb, B_multi_stride, 0, get_B_pretranspose_window_size());
+    }
+
+    void pretranspose_B_array_part(void *in_buffer, const To *B, const int ldb, const int B_multi_stride, size_t start, size_t end) override {
+        if (end >= get_B_pretranspose_window_size()) {
+            requantize_bias(in_buffer, B, ldb, B_multi_stride);
+        }
 
         // Put the transposed data after the column sums - in non-transposing cases get_col_sum_size() == 0
         uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
-        Troi *buffer = reinterpret_cast<Troi *>(buffer_int + get_col_sum_size());
-        _B_transposed = buffer;
+        Troi *buffer_base = reinterpret_cast<Troi *>(buffer_int + get_col_sum_size());
+        _B_transposed = buffer_base;
 
         strategy strat(_args._ci);
+        size_t work_per_multi = iceildiv(_args._Nsize, strategy::out_width());
+
+        for (unsigned int multi=(start / work_per_multi); multi<_args._nmulti; multi++) {
+            // Work out which part of the window space this multi occupies,
+            // skip to the next multi or exit as needed.
+            size_t wk_start = multi * work_per_multi;
+            size_t wk_end = (multi + 1) * work_per_multi;
+
+            assert(wk_end > start);
+
+            if (wk_start >= end) {
+                break;
+            }
 
-        for (unsigned int multi=0; multi<_args._nmulti; multi++) {
             for (unsigned int k0=0; k0<_Ktotal; k0+=_k_block) {
                 const unsigned int kmax=std::min(k0 + _k_block, _Ktotal);
 
                 /* Figure out the size of each block. */
                 unsigned int k_size = kmax - k0;
 
+                // Correct the N range and buffer base if we are not processing the whole block.
+                size_t n_start = 0;
+                size_t n_end = _args._Nsize;
+
+                // If we are not doing the first columns, update the buffer write position and starting N value.
+                if (start > wk_start) {
+                    n_start = (start - wk_start) * strategy::out_width();
+                }
+
+                // If we are not doing the last items, update the final N value.
+                if (end < wk_end) {
+                    n_end = (end - wk_start) * strategy::out_width();
+                }
+
+                // Set the buffer pointer
+                Troi *buffer = buffer_base +
+                               (roundup(_args._Nsize, strategy::out_width()) * (multi * _Ktotal + k0)) +
+                               (n_start * roundup(k_size, strategy::k_unroll()));
+
                 if (_args._Ksections > 1) {
                     // We need to insert padding at the end of each K section.
-                    // The computation needed is a little delicate - the coordinates from the block walker are expressed in
+                    // The computation needed is a little delicate - the k0/kmax coordinates are expressed in
                     // terms of the full, padded, _Ktotal.
                     // But we need to transform each section with reference to the original, unpadded, input, letting the
                     // transform pad each section as needed.
@@ -657,7 +698,7 @@ public:
                     // The expected output format is also an entire columns interleaved, then the next set of
                     // columns, and so on. This means, as we are breaking it up vertically, we have to do it one column at
                     // a time.
-                    for (unsigned int x0=0; x0 < _args._Nsize; x0 += strategy::out_width() ){
+                    for (unsigned int x0 = n_start; x0 < n_end; x0 += strategy::out_width()) {
                         unsigned int xmax = std::min(x0 + strategy::out_width(), _args._Nsize);
 
                         // Track where we are and how much work is left.
@@ -690,8 +731,7 @@ public:
                 } else {
                     // In the single K section case, can process the whole lot in one go.
                     strat.transforms.PrepareB(buffer, B + (multi * B_multi_stride), ldb,
-                                              0, _args._Nsize, k0, std::min(kmax, _args._Ksize));
-                    buffer += roundup(_args._Nsize, strategy::out_width()) * roundup(kmax-k0, strategy::k_unroll());
+                                              n_start, n_end, k0, std::min(kmax, _args._Ksize));
                 }
             }
         }
--
cgit v1.2.1
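
The patch turns pretranspose into window-based work: get_B_pretranspose_window_size() reports
_nmulti * ceil(_Nsize / out_width()) units, and pretranspose_B_array_part() processes only the
[start, end) slice of that window, with requantize_bias() run by whichever slice reaches the end of
the window. A minimal sketch of how a caller could fan that window out over several threads follows;
the helper name pretranspose_in_parallel, the even split and the raw std::thread usage are
illustrative assumptions and not the scheduler Compute Library itself uses.

    // Sketch only: even split of the pretranspose window across threads.
    // `GemmT` stands in for the concrete arm_gemm object that exposes the two
    // methods added by this patch; the driver itself is hypothetical.
    #include <cstddef>
    #include <thread>
    #include <vector>

    template <typename GemmT, typename To>
    void pretranspose_in_parallel(GemmT &gemm, void *out_buffer, const To *B,
                                  int ldb, int B_multi_stride, unsigned int nthreads) {
        const std::size_t total = gemm.get_B_pretranspose_window_size();
        std::vector<std::thread> workers;

        for (unsigned int t = 0; t < nthreads; t++) {
            // Contiguous, non-overlapping [start, end) ranges covering the window.
            std::size_t start = (total * t) / nthreads;
            std::size_t end   = (total * (t + 1)) / nthreads;
            if (start == end) {
                continue;
            }
            // Each range writes a disjoint region of out_buffer; the range that
            // reaches the end of the window also performs requantize_bias().
            workers.emplace_back([&gemm, out_buffer, B, ldb, B_multi_stride, start, end]() {
                gemm.pretranspose_B_array_part(out_buffer, B, ldb, B_multi_stride, start, end);
            });
        }
        for (auto &w : workers) {
            w.join();
        }
    }

In the library itself the same window/part interface would be driven by its own scheduler rather
than raw threads; the sketch only illustrates the contract of the two new methods.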