From dba672cec878966e465bb476e896c8f75bbd9145 Mon Sep 17 00:00:00 2001
From: SiCong Li
Date: Thu, 6 Apr 2023 16:30:18 +0100
Subject: Integrate multi-threaded pretranspose_B_array

This is required for the case where rhs (B) is dynamic and needs to be
pretransposed in every run. In a multi-threaded setting, this means the
previously single-threaded pretranspose_B_array would become the
bottleneck.

Resolves COMPMID-5896

Signed-off-by: SiCong Li
Change-Id: Id508c46992188a0f76a505152931d4955d04c16d
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9455
Tested-by: Arm Jenkins
Reviewed-by: Viet-Hoa Do
Reviewed-by: Jakub Sujak
Comments-Addressed: Arm Jenkins
Benchmark: Arm Jenkins
---
 arm_compute/runtime/IScheduler.h                   |  6 +--
 .../NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp | 58 ++++++++++++++++++----
 .../NEON/kernels/arm_gemm/gemm_interleaved.hpp     | 39 +++++++++++++--
 src/cpu/kernels/assembly/gemm_common.hpp           | 24 ++++++++-
 .../operators/internal/CpuGemmAssemblyDispatch.cpp | 50 +++++++++++++++++--
 src/runtime/IScheduler.cpp                         |  6 +--
 6 files changed, 159 insertions(+), 24 deletions(-)

diff --git a/arm_compute/runtime/IScheduler.h b/arm_compute/runtime/IScheduler.h
index 129009c58d..df5a44001f 100644
--- a/arm_compute/runtime/IScheduler.h
+++ b/arm_compute/runtime/IScheduler.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -176,9 +176,9 @@ public:
 
     /** Execute all the passed workloads
      *
-     * @note there is no guarantee regarding the order in which the workloads will be executed or whether or not they will be executed in parallel.
+     * @note There is no guarantee regarding the order in which the workloads will be executed or whether or not they will be executed in parallel.
      *
-     * @param[in] workloads Array of workloads to run
+     * @param[in] workloads List of workloads to run
     * @param[in] tag       String that can be used by profiling tools to identify the workloads run by the scheduler (Can be null).
      */
     virtual void run_tagged_workloads(std::vector<Workload> &workloads, const char *tag);
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp
index 90e2f07607..0bbcd10b66 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2022 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -614,6 +614,10 @@ public:
         return size;
     }
 
+    size_t get_B_pretranspose_window_size() const override {
+        return _args._nmulti * iceildiv(_args._Nsize, strategy::out_width());
+    }
+
     void requantize_bias(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
         if (std::is_same<OutputStage, Requantize32>::value) {
             _col_bias = reinterpret_cast<int32_t *>(in_buffer);
@@ -628,25 +632,62 @@ public:
     }
 
     void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
-        requantize_bias(in_buffer, B, ldb, B_multi_stride);
+        pretranspose_B_array_part(in_buffer, B, ldb, B_multi_stride, 0, get_B_pretranspose_window_size());
+    }
+
+    void pretranspose_B_array_part(void *in_buffer, const To *B, const int ldb, const int B_multi_stride, size_t start, size_t end) override {
+        if (end >= get_B_pretranspose_window_size()) {
+            requantize_bias(in_buffer, B, ldb, B_multi_stride);
+        }
 
         // Put the transposed data after the column sums - in non-transposing cases get_col_sum_size() == 0
         uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
-        Troi *buffer = reinterpret_cast<Troi *>(buffer_int + get_col_sum_size());
-        _B_transposed = buffer;
+        Troi *buffer_base = reinterpret_cast<Troi *>(buffer_int + get_col_sum_size());
+        _B_transposed = buffer_base;
 
         strategy strat(_args._ci);
+        size_t work_per_multi = iceildiv(_args._Nsize, strategy::out_width());
+
+        for (unsigned int multi=(start / work_per_multi); multi<_args._nmulti; multi++) {
+            // Work out which part of the window space this multi occupies,
+            // skip to the next multi or exit as needed.
+            size_t wk_start = multi * work_per_multi;
+            size_t wk_end = (multi + 1) * work_per_multi;
+
+            assert(wk_end > start);
+
+            if (wk_start >= end) {
+                break;
+            }
 
-        for (unsigned int multi=0; multi<_args._nmulti; multi++) {
             for (unsigned int k0=0; k0<_Ktotal; k0+=_k_block) {
                 const unsigned int kmax=std::min(k0 + _k_block, _Ktotal);
 
                 /* Figure out the size of each block. */
                 unsigned int k_size = kmax - k0;
 
+                // Correct the N range and buffer base if we are not processing the whole block.
+                size_t n_start = 0;
+                size_t n_end = _args._Nsize;
+
+                // If we are not doing the first columns, update the buffer write position and starting N value.
+                if (start > wk_start) {
+                    n_start = (start - wk_start) * strategy::out_width();
+                }
+
+                // If we are not doing the last items, update the final N value.
+                if (end < wk_end) {
+                    n_end = (end - wk_start) * strategy::out_width();
+                }
+
+                // Set the buffer pointer
+                Troi *buffer = buffer_base +
+                    (roundup(_args._Nsize, strategy::out_width()) * (multi * _Ktotal + k0)) +
+                    (n_start * roundup(k_size, strategy::k_unroll()));
+
                 if (_args._Ksections > 1) {
                     // We need to insert padding at the end of each K section.
-                    // The computation needed is a little delicate - the coordinates from the block walker are expressed in
+                    // The computation needed is a little delicate - the k0/kmax coordinates are expressed in
                     // terms of the full, padded, _Ktotal.
                     // But we need to transform each section with reference to the original, unpadded, input, letting the
                     // transform pad each section as needed.
@@ -657,7 +698,7 @@ public:
                     // The expected output format is also an entire columns interleaved, then the next set of
                     // columns, and so on. This means, as we are breaking it up vertically, we have to do it one column at
                    // a time.
-                    for (unsigned int x0=0; x0 < _args._Nsize; x0 += strategy::out_width() ){
+                    for (unsigned int x0 = n_start; x0 < n_end; x0 += strategy::out_width()) {
                         unsigned int xmax = std::min(x0 + strategy::out_width(), _args._Nsize);
 
                         // Track where we are and how much work is left.
@@ -690,8 +731,7 @@ public:
                 } else {
                     // In the single K section case, can process the whole lot in one go.
                     strat.transforms.PrepareB(buffer, B + (multi * B_multi_stride), ldb,
-                                              0, _args._Nsize, k0, std::min(kmax, _args._Ksize));
-                    buffer += roundup(_args._Nsize, strategy::out_width()) * roundup(kmax-k0, strategy::k_unroll());
+                                              n_start, n_end, k0, std::min(kmax, _args._Ksize));
                 }
             }
         }
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
index c2fd0b0e8c..13f548e39e 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
@@ -31,6 +31,7 @@
 #include "convolver.hpp"
 #include "kernel_weight_format.hpp"
 #include "kernel_traits.hpp"
+#include "kernel_weight_format.hpp"
 #include "mergeresults.hpp"
 #include "performance_parameters.hpp"
 #include "quantized.hpp"
@@ -1039,6 +1040,13 @@ public:
         return (x_size * _Ktotal * _nmulti * sizeof(Toi)) + get_col_sum_size();
     }
 
+    size_t get_B_pretranspose_window_size() const override {
+        size_t n_blocks = iceildiv(_Nsize, _x_block);
+        size_t k_blocks = iceildiv(_Ktotal, _k_block);
+
+        return n_blocks * k_blocks * _nmulti;
+    }
+
     void requantize_bias(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
         if (std::is_same<OutputStage, Requantize32>::value) {
             col_bias = reinterpret_cast<int32_t *>(in_buffer);
@@ -1053,7 +1061,14 @@ public:
     }
 
     void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
-        requantize_bias(in_buffer, B, ldb, B_multi_stride);
+        pretranspose_B_array_part(in_buffer, B, ldb, B_multi_stride, 0, get_B_pretranspose_window_size());
+    }
+
+    void pretranspose_B_array_part(void *in_buffer, const To *B, const int ldb, const int B_multi_stride, size_t start, size_t end) override {
+        // Perform column sums etc as part of the last block.
+        if (end >= get_B_pretranspose_window_size()) {
+            requantize_bias(in_buffer, B, ldb, B_multi_stride);
+        }
 
         // Put the transposed data after the column sums - in non-quantized cases get_col_sum_size() == 0
         uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
@@ -1063,7 +1078,20 @@ public:
         blockwalker current(*this);
         strategy strat(_ci);
 
-        do {
+        // Skip over blocks we aren't doing
+        for(size_t i = 0; i < start; i++) {
+            buffer += roundup(current.xmax() - current.x0(), strategy::out_width()) * roundup(current.kmax() - current.k0(), strategy::k_unroll());
+            current.advance();
+        }
+
+        size_t blocks_left = (end - start);
+
+        // Double check that we haven't run out of work
+        if (current.done()) {
+            blocks_left = 0;
+        }
+
+        for (/* blocks_left initialized above */; blocks_left > 0; blocks_left--) {
             /* Figure out the size of each block. */
             unsigned int k_size = (current.kmax() - current.k0());
 
@@ -1117,7 +1145,12 @@ public:
                                           current.x0(), current.xmax(), current.k0(), std::min(current.kmax(), _Ksize));
                 buffer += roundup(current.xmax() - current.x0(), strategy::out_width()) * roundup(current.kmax() - current.k0(), strategy::k_unroll());
             }
-        } while (current.advance());
+
+            // Advance to the next block, break if we run off the end.
+            if (!current.advance()) {
+                break;
+            }
+        }
     }
 
     void set_pretransposed_B_data(void *in_buffer) override {
diff --git a/src/cpu/kernels/assembly/gemm_common.hpp b/src/cpu/kernels/assembly/gemm_common.hpp
index ece9ca5802..834cd1061e 100644
--- a/src/cpu/kernels/assembly/gemm_common.hpp
+++ b/src/cpu/kernels/assembly/gemm_common.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021,2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -113,9 +113,17 @@ public:
     {
         return 0;
     }
+    /* Amount of work for the threaded cases */
+    virtual size_t get_B_pretranspose_window_size() const
+    {
+        return 1;
+    }
     /* Perform pretranspose - arguments are output, input, input row stride and input multi stride. */
     /* The "real" version of this depends on the templated operand type (see below). */
     virtual void pretranspose_B_array_generic(void *, const void *, const int, const int) = 0;
+    /* Threaded version with window start/end parameters */
+    virtual void pretranspose_B_array_part_generic(void *, const void *, const int, const int, const size_t, const size_t) = 0;
+
     /* Set pretransposed data - the void * passed in must previously have been passed to pretranspose_B_array() for the same or a similar GEMM. */
     virtual void set_pretransposed_B_data(void *)
     {
@@ -225,6 +233,20 @@ public:
         pretranspose_B_array(out, static_cast<const To *>(in), row_stride, multi_stride);
     }
 
+    /* Threaded versions of the above.
+     * The fallback/backwards compatible version of the threaded interface exposes a window size of 1 and
+     * just calls the non-threaded functions to do the work. This is valid as with window size of 1 the only
+     * legal values for start and end are 0 and 1 respectively. */
+    virtual void pretranspose_B_array_part(void *out, const To *in, const int row_stride, const int multi_stride, size_t, size_t)
+    {
+        pretranspose_B_array(out, in, row_stride, multi_stride);
+    };
+
+    void pretranspose_B_array_part_generic(void *out, const void *in, const int row_stride, const int multi_stride, size_t start, size_t end) override
+    {
+        pretranspose_B_array_part(out, static_cast<const To *>(in), row_stride, multi_stride, start, end);
+    }
+
     /*** Indirect interface ***/
     virtual void set_indirect_parameters(size_t, const To *const *const *)
     {
diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
index 9af98be41d..9c85631406 100644
--- a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
+++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
@@ -38,6 +38,46 @@ namespace arm_compute
 {
 namespace cpu
 {
+namespace
+{
+/** Run pretranspose_B_array in parallel (1D static scheduling)
+ *
+ * @tparam TypeInput
+ * @tparam TypeOutput
+ *
+ * @param[in] gemm_asm         GemmCommon kernel to run
+ * @param[in] dst              Pretransposed B array
+ * @param[in] src              B array to be pretransposed
+ * @param[in] src_ld           Stride in y
+ * @param[in] src_multi_stride Stride in z ("multi")
+ * @param[in] num_threads      Number of threads to run this method. Must be >= 1
+ */
+template <typename TypeInput, typename TypeOutput>
+void run_parallel_pretranspose_B_array(arm_gemm::GemmCommon<TypeInput, TypeOutput> *gemm_asm, ITensor *dst, const TypeInput *src, int src_ld, int src_multi_stride, unsigned int num_threads)
+{
+    ARM_COMPUTE_ERROR_ON(gemm_asm == nullptr);
+    ARM_COMPUTE_ERROR_ON(num_threads == 0);
+    // The window size is also the total workload size
+    const unsigned int wsize = gemm_asm->get_B_pretranspose_window_size();
+
+    std::vector<IScheduler::Workload> workloads(num_threads);
+    for(unsigned int t = 0; t < num_threads; ++t)
+    {
+        workloads[t] = [ = ](const ThreadInfo & info)
+        {
+            const unsigned int start = (info.thread_id * wsize) / num_threads;
+            const unsigned int end   = ((info.thread_id + 1) * wsize) / num_threads;
+
+            if(start < end)
+            {
+                gemm_asm->pretranspose_B_array_part(dst->buffer(), src, src_ld, src_multi_stride, start, end);
+            }
+        };
+    }
+    NEScheduler::get().run_tagged_workloads(workloads, "CpuGemmAssemblyDispatch/pretranspose_B_array");
+}
+} // namespace
+
 using namespace arm_compute::experimental;
 
 namespace
 {
@@ -436,7 +476,7 @@ void Fallback::prepare(ITensorPack &tensors)
 
         CpuAuxTensorHandler pretranspose(offset_int_vec(Pretranspose), _pretranspose_info, tensors, false);
         ARM_COMPUTE_ERROR_ON(pretranspose.get()->buffer() == nullptr);
-        _gemm_kernel_asm->pretranspose_B_array(pretranspose.get()->buffer(), in1_ptr, ldb, multi_stride_b);
+        run_parallel_pretranspose_B_array<TypeInput, TypeOutput>(_gemm_kernel_asm.get(), pretranspose.get(), in1_ptr, ldb, multi_stride_b, NEScheduler::get().num_threads());
 
         b->mark_as_unused();
     }
@@ -493,9 +533,9 @@ void Fallback::run(ITensorPack &tensors)
     // Check if B is pre-tranposed and de-reference if not
     if(!_gemm_kernel_asm->B_is_pretransposed())
     {
-        ldb = b->info()->strides_in_bytes().y() / b->info()->element_size();
-        multi_stride_b = b->info()->strides_in_bytes().z() / b->info()->element_size();
-        in1_ptr = reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes());
+        ldb            = b->info()->strides_in_bytes().y() / b->info()->element_size();
+        multi_stride_b = b->info()->strides_in_bytes().z() / b->info()->element_size();
+        in1_ptr        = reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes());
     }
 
     // If necessary, run pretranspose every time if either weights or biases are non-constant
@@ -522,7 +562,7 @@ void Fallback::run(ITensorPack &tensors)
         }
         else
         {
-            _gemm_kernel_asm->pretranspose_B_array(pretranspose.get()->buffer(), b_ptr, ldb, multi_stride_b);
+            run_parallel_pretranspose_B_array<TypeInput, TypeOutput>(_gemm_kernel_asm.get(), pretranspose.get(), b_ptr, ldb, multi_stride_b, NEScheduler::get().num_threads());
         }
     }
 }
diff --git a/src/runtime/IScheduler.cpp b/src/runtime/IScheduler.cpp
index 39f41555fa..436fd9ca16 100644
--- a/src/runtime/IScheduler.cpp
+++ b/src/runtime/IScheduler.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2022 Arm Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -139,7 +139,7 @@ void IScheduler::schedule_common(ICPPKernel *kernel, const Hints &hints, const W
         default:
             ARM_COMPUTE_ERROR("Unknown strategy");
     }
-    // Make sure the smallest window is larger than minimim workload size
+    // Make sure the smallest window is larger than minimum workload size
     num_windows = adjust_num_of_windows(max_window, hints.split_dimension(), num_windows, *kernel, cpu_info());
 
     std::vector<IScheduler::Workload> workloads(num_windows);
@@ -178,7 +178,7 @@ void IScheduler::run_tagged_workloads(std::vector &workloads, const ch
 std::size_t IScheduler::adjust_num_of_windows(const Window &window, std::size_t split_dimension, std::size_t init_num_windows, const ICPPKernel &kernel, const CPUInfo &cpu_info)
 {
     // Mitigation of the narrow split issue, which occurs when the split dimension is too small to split (hence "narrow").
-    if(window.num_iterations(split_dimension) < init_num_windows )
+    if(window.num_iterations(split_dimension) < init_num_windows)
     {
         auto recommended_split_dim = Window::DimX;
         for(std::size_t dims = Window::DimY; dims <= Window::DimW; ++dims)
-- 
cgit v1.2.1
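
For reference, below is a minimal, self-contained sketch (not part of the patch above) of the static 1D split that run_parallel_pretranspose_B_array performs: the window reported by get_B_pretranspose_window_size() is divided evenly over the threads, and each thread processes its own [start, end) sub-range via pretranspose_B_array_part(). MockGemm, the window size of 23 and the plain std::thread pool used here are illustrative stand-ins only; the real code goes through arm_gemm::GemmCommon and the NEScheduler tagged-workload mechanism shown in the diff.

// Illustrative sketch only: MockGemm stands in for arm_gemm::GemmCommon.
#include <cstddef>
#include <cstdio>
#include <thread>
#include <vector>

struct MockGemm {
    // Total pretranspose workload, analogous to get_B_pretranspose_window_size().
    std::size_t get_B_pretranspose_window_size() const { return 23; }

    // Analogous to pretranspose_B_array_part(): handles window units [start, end).
    void pretranspose_B_array_part(std::size_t start, std::size_t end) const {
        std::printf("processing window [%zu, %zu)\n", start, end);
    }
};

int main() {
    MockGemm gemm;
    const std::size_t wsize       = gemm.get_B_pretranspose_window_size();
    const std::size_t num_threads = 4;

    std::vector<std::thread> workers;
    for (std::size_t t = 0; t < num_threads; t++) {
        // Same static partitioning as the patch: thread t gets [t*wsize/N, (t+1)*wsize/N).
        const std::size_t start = (t * wsize) / num_threads;
        const std::size_t end   = ((t + 1) * wsize) / num_threads;
        if (start < end) {
            workers.emplace_back([&gemm, start, end] { gemm.pretranspose_B_array_part(start, end); });
        }
    }
    for (auto &w : workers) {
        w.join();
    }
    return 0;
}

Because the bounds are computed as (t * wsize) / num_threads, the sub-ranges are contiguous, disjoint and cover the whole window even when wsize is not a multiple of num_threads; threads whose range is empty simply skip the call, matching the start < end guard in the patch.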