Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp')
-rw-r--r-- | src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp | 143 |
1 file changed, 80 insertions, 63 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
index aeeed26702..a6c9677305 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,18 +23,15 @@
  */
 #pragma once
 
-#include <assert.h>
-
 #include <algorithm>
+#include <cassert>
 
 #include "arm_gemm.hpp"
 #include "bias_adder.hpp"
-#include "utils.hpp"
-
-#include "arm_compute/core/NEON/kernels/arm_gemm/ndrange.hpp"
-
-#include "mergeresults.hpp"
+#include "ndrange.hpp"
+#include "performance_parameters.hpp"
 #include "transform.hpp"
+#include "utils.hpp"
 
 #ifdef CYCLE_PROFILING
 #include "profiler.hpp"
@@ -58,8 +55,6 @@ class GemmHybrid : public GemmCommon<To, Tr> {
     const unsigned int _nbatches;
     const unsigned int _nmulti;
 
-    const bool _trB;
-
     const Activation _act;
 
     /* Blocking info */
@@ -73,60 +68,58 @@ class GemmHybrid : public GemmCommon<To, Tr> {
     const NDRange<4> _window_range;
 
     static unsigned int compute_k_block(const GemmArgs &args) {
-        // Some kernels don't support append mode - these can't do K blocking at all.
-        if (!strategy::supports_append()) {
+        // Some kernels don't support accumulate mode - these can't do K blocking at all.
+        if (!strategy::supports_accumulate()) {
             return args._Ksize;
         }
 
         if (args._cfg && args._cfg->inner_block_size) {
-            return args._cfg->inner_block_size;
+            return roundup(args._cfg->inner_block_size, strategy::k_unroll());
         }
 
-        const unsigned int L1_size = args._ci->get_L1_cache_size();
+        // Target block size (512 for FP32, scaling for other types).  Don't block until size reaches 1.5X this.
+        unsigned int target_block_size = 2048 / sizeof(To);
 
-        // k_block: Find out how much of the larger array can be loaded into half the cache.
-        // This should account for associative caches.
-        unsigned int k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
+        if (args._Ksize >= ((3 * target_block_size) / 2)) {
+            unsigned int target_blocks = iceildiv(args._Ksize, target_block_size);
 
-        // Needs to be (at least a single) multiple of the K unroll level.
-        k_block /= strategy::k_unroll();
-        k_block = std::max(k_block, 1U) * strategy::k_unroll();
+            unsigned int block_size = iceildiv(args._Ksize, target_blocks);
 
-        // Now tune to presented problem size; this is how many blocks we need.
-        unsigned int numk_blocks = iceildiv(args._Ksize, k_block);
+            block_size = roundup(block_size, strategy::k_unroll());
 
-        // So divide the space equally into that many blocks.
-        k_block = iceildiv(args._Ksize, numk_blocks);
-
-        // And round UP to the K unroll level required.
-        k_block = roundup(k_block, strategy::k_unroll());
+            return block_size;
+        }
 
-        return k_block;
+        return args._Ksize;
     }
 
+    // New N blocking strategy: if it's narrow, or much taller than it is wide, do the full width.  Otherwise do a
+    // single block.
     static unsigned int compute_n_block(const GemmArgs &args) {
         if (args._cfg && args._cfg->outer_block_size) {
-            return args._cfg->outer_block_size;
-        }
+            unsigned int n_block = args._cfg->outer_block_size;
 
-        const unsigned int k_block = compute_k_block(args);
-        const unsigned int L2_size = args._ci->get_L2_cache_size();
+            // Needs to be (at least a single) multiple of the kernel output width.
+            n_block /= strategy::out_width();
+            n_block = std::max(n_block, 1u) * strategy::out_width();
 
-        // n_block: Work out how many rows (of length k_block) will fit in the L2
-        // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
-        unsigned int n_block = (((L2_size * 9) / 10) - (k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) /
-                      (sizeof(Toi) * k_block);
+            return n_block;
+        }
 
-        // Needs to be (at least a single) multiple of the kernel output width.
-        n_block /= strategy::out_width();
-        n_block = std::max(n_block, 1U) * strategy::out_width();
+        if (args._Nsize <= 64) {
+            return args._Nsize;
+        }
 
-        // And tune to the presented problem size.
-        unsigned int numblocks = iceildiv(args._Nsize, n_block);
-        n_block = iceildiv(args._Nsize, numblocks);
-        n_block = roundup(n_block, strategy::out_width());
+        if ((args._Msize / args._Nsize) > 155) {
+            return args._Nsize;
+        }
 
-        return n_block;
+        // Go slightly wider if thread count and depth are small.
+        if ((args._Ksize <= 128) && (args._maxthreads <= 16)) {
+            return strategy::out_width() * 3;
+        }
+
+        return strategy::out_width();
     }
 
 public:
@@ -136,7 +129,7 @@ public:
     /* Constructor */
     GemmHybrid(const GemmArgs &args)
               : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize),
-                _nbatches(args._nbatches), _nmulti(args._nmulti), _trB(args._trB),
+                _nbatches(args._nbatches), _nmulti(args._nmulti),
                 _act(args._act),
                 _k_block(compute_k_block(args)), _n_block(compute_n_block(args)),
                 _Mround(roundup(args._Msize, strategy::out_height())),
@@ -144,7 +137,7 @@ public:
 
     // Interface implementation - Compulsory functions
     ndrange_t get_window_size() const override {
-        return { _window_range.total_size(), 1u, 1u, 1u, 1u, 1u };
+        return { _window_range.total_size() };
     }
 
     // This kernel can always be dynamically scheduled.
@@ -152,8 +145,8 @@ public:
         return true;
     }
 
-    void execute_1d(unsigned int start, unsigned int end, int threadid) {
-        UNUSED(threadid);
+    // Execute
+    void execute(const ndcoord_t &work_range, const ndcoord_t &, int) override {
 #ifdef CYCLE_PROFILING
         profiler prof;
 #endif
@@ -174,7 +167,7 @@ public:
             const bool first_pass = (k0 == 0);
             const bool last_pass = (kmax == _Ksize);
 
-            auto p = _window_range.iterator(start, end);
+            auto p = _window_range.iterator(work_range.get_position(0), work_range.get_position_end(0));
 
             if (p.done()) {
                 return;
@@ -194,7 +187,7 @@ public:
                                      (n0 * kern_k);
 
 #ifdef CYCLE_PROFILING
-                auto p = prof.ScopedProfiler(PROFILE_KERNEL, (m_end - m_start) * kern_k * roundup(nmax-n0, strategy::out_width()));
+                auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(m_end - m_start) * kern_k * roundup(nmax-n0, strategy::out_width()));
 #endif
 
                 strat.kernel(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + (m_start * this->_lda) + k0, this->_lda,
@@ -215,17 +208,6 @@ public:
         }
     }
 
-    // Execute
-    void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override {
-        UNUSED(thread_locator);
-
-        const auto start = work_range.get_position(0);
-        const auto size = work_range.get_size(0);
-        const auto stop = start + size;
-
-        execute_1d(start, stop, threadid);
-    }
-
     // Interface implementation - pretransposed
     bool B_is_pretransposed() const override {
         return true;
@@ -239,7 +221,9 @@ public:
         return roundup(_Nsize, strategy::out_width()) * roundup(_Ksize, strategy::k_unroll()) * _nmulti * sizeof(Toi);
     }
 
-    void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
+    void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride, bool transposed) override {
+        assert(!transposed);
+
         Toi *buffer = reinterpret_cast<Toi *>(in_buffer);
         _B_transposed = buffer;
         strategy strat(_ci);
@@ -255,7 +239,7 @@ public:
                 const unsigned int size = roundup(xmax-x0, strategy::out_width()) * k_size;
 
                 strat.transforms.PrepareB( buffer, B + (multi * B_multi_stride), ldb,
-                                           x0, xmax, k0, kmax, _trB);
+                                           x0, xmax, k0, kmax, false);
 
                 buffer += size;
             }
@@ -266,6 +250,39 @@ public:
     void set_pretransposed_B_data(void *in_buffer) override {
         _B_transposed = reinterpret_cast<Toi *>(in_buffer);
     }
+
+    // Estimate cycles for given problem given provided parameters
+    static uint64_t estimate_cycles(const GemmArgs &args, const PerformanceParameters &params) {
+        // Note: Current hybrid kernels don't actually round up height (they
+        // have paths for each possible height).  Might need to make this
+        // configurable in future.
+        uint64_t total_macs = static_cast<uint64_t>(args._nbatches) * args._nmulti * args._Msize * roundup(args._Nsize, strategy::out_width()) * roundup(args._Ksize, strategy::k_unroll());
+
+        float mac_cycles = static_cast<float>(total_macs) / params.kernel_macs_cycle;
+
+        // TODO: A bit of a kludge here: current hybrid kernels incur extra
+        // overhead where the width is not a multiple of kernel width.  It's
+        // most noticable where the overall width is quite low, so add 15%
+        // penalty for such widths.
+        if ((args._Nsize < strategy::out_width()) || (args._Nsize > strategy::out_width() && args._Nsize < 2*strategy::out_width())) {
+            mac_cycles *= 1.15f;
+        }
+
+        uint64_t total_cycles = mac_cycles;
+
+        return total_cycles;
+    }
+
+    GemmConfig get_config() override {
+        GemmConfig c;
+
+        c.method = GemmMethod::GEMM_HYBRID;
+        c.inner_block_size = _k_block;
+        c.outer_block_size = _n_block;
+        c.filter = get_type_name<strategy>();
+
+        return c;
+    }
 };
 
 } // namespace arm_gemm
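
For reference, the arithmetic behind the new blocking heuristics can be reproduced standalone. The sketch below assumes a hypothetical FP32 kernel with out_width() == 16 and k_unroll() == 1, and re-implements iceildiv/roundup locally; the function names compute_k_block_fp32 and compute_n_block_fp32 are illustrative, not library API.

// Standalone sketch of the new K/N blocking heuristics (hypothetical kernel
// parameters; iceildiv/roundup re-implemented here, not taken from utils.hpp).
#include <cstdio>

static unsigned int iceildiv(unsigned int a, unsigned int b) { return (a + b - 1) / b; }
static unsigned int roundup(unsigned int a, unsigned int b)  { return iceildiv(a, b) * b; }

// K blocking: target 2048 bytes of the input type per block (512 elements for FP32);
// only block once K reaches 1.5x the target, then split K as evenly as possible.
static unsigned int compute_k_block_fp32(unsigned int K, unsigned int k_unroll) {
    const unsigned int target_block_size = 2048 / sizeof(float); // 512
    if (K >= (3 * target_block_size) / 2) {
        unsigned int target_blocks = iceildiv(K, target_block_size);
        unsigned int block_size    = iceildiv(K, target_blocks);
        return roundup(block_size, k_unroll);
    }
    return K;
}

// N blocking: full width when narrow or much taller than wide, a wider block
// for shallow problems on few threads, otherwise a single kernel-width block.
static unsigned int compute_n_block_fp32(unsigned int M, unsigned int N, unsigned int K,
                                         unsigned int out_width, int maxthreads) {
    if (N <= 64)                      return N;
    if ((M / N) > 155)                return N;
    if (K <= 128 && maxthreads <= 16) return out_width * 3;
    return out_width;
}

int main() {
    // e.g. M=N=1024, K=4096, out_width=16, k_unroll=1, 8 threads
    std::printf("k_block=%u\n", compute_k_block_fp32(4096, 1));                 // 512
    std::printf("n_block=%u\n", compute_n_block_fp32(1024, 1024, 4096, 16, 8)); // 16
    return 0;
}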
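
Similarly, the estimate_cycles() model added here is just the total MAC count divided by the strategy's rated MACs per cycle, with a 15% penalty when the width is low and not a multiple of the kernel width. A minimal sketch, assuming an illustrative rating of 16 MACs/cycle and the same hypothetical kernel dimensions as above:

// Sketch of the estimate_cycles() model: total MACs / rated MACs-per-cycle,
// plus a 15% penalty when N is low and not a multiple of the kernel width.
// The 16 MACs/cycle rating and kernel dimensions are illustrative only.
#include <cstdint>
#include <cstdio>

int main() {
    const uint64_t nbatches = 1, nmulti = 1, M = 1024, N = 1000, K = 4096;
    const uint64_t out_width = 16, k_unroll = 1;
    const float kernel_macs_cycle = 16.0f; // assumed PerformanceParameters value

    auto roundup = [](uint64_t a, uint64_t b) { return ((a + b - 1) / b) * b; };

    // Height is not rounded up: hybrid kernels have paths for every height.
    uint64_t total_macs = nbatches * nmulti * M * roundup(N, out_width) * roundup(K, k_unroll);
    float mac_cycles = static_cast<float>(total_macs) / kernel_macs_cycle;

    // Narrow-width penalty, as in the diff above.
    if (N < out_width || (N > out_width && N < 2 * out_width)) {
        mac_cycles *= 1.15f;
    }

    std::printf("estimated cycles: %llu\n", static_cast<unsigned long long>(mac_cycles));
    return 0;
}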