Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/gemm_interleaved_2d.hpp')
-rw-r--r-- | src/core/NEON/kernels/arm_gemm/gemm_interleaved_2d.hpp | 445
1 files changed, 0 insertions, 445 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved_2d.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved_2d.hpp
deleted file mode 100644
index 376d19cc65..0000000000
--- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved_2d.hpp
+++ /dev/null
@@ -1,445 +0,0 @@
-/*
- * Copyright (c) 2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#include "arm_gemm.hpp"
-#include "utils.hpp"
-
-#include "mergeresults.hpp"
-#include "transform.hpp"
-
-#ifdef CYCLE_PROFILING
-#include "profiler.hpp"
-#endif
-
-#include <algorithm>
-#include <cassert>
-
-// Some macros used to decide how much working space to allocate.
-// Round allocations up to the next cache line.
-#define ALLOC_ROUND 64
-#define ROUND_UP(x) ((((x) + ALLOC_ROUND-1) / ALLOC_ROUND) * ALLOC_ROUND)
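As a side note on the two macros (a standalone sketch, not part of the original file): ROUND_UP pads any byte count up to the next multiple of the 64-byte cache line, so the per-thread buffers carved out of the working space later never share a line. For example:

    #include <cstdio>

    #define ALLOC_ROUND 64
    #define ROUND_UP(x) ((((x) + ALLOC_ROUND-1) / ALLOC_ROUND) * ALLOC_ROUND)

    int main() {
        // A 1-byte request still occupies a whole line; exact multiples are unchanged.
        printf("%d %d %d\n", ROUND_UP(1), ROUND_UP(64), ROUND_UP(100)); // 64 64 128
    }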
-// Implementation of the GemmCommon abstract class.
-//
-// This implementation interleaves the source matrices in blocks - good for
-// larger matrices.
-namespace arm_gemm {
-
-template<typename strategy, typename To, typename Tr>
-class GemmInterleaved2d : public GemmCommon<To, Tr> {
-    typedef typename strategy::operand_type Toi;
-    typedef typename strategy::result_type Tri;
-
-    /* const properties set by constructor */
-    const CPUInfo * const _ci;
-
-    const unsigned int _Msize;
-    const unsigned int _Nsize;
-    const unsigned int _Ksize;
-
-    const unsigned int _nbatches;
-    const unsigned int _nmulti;
-
-    const bool _trA;
-    const bool _trB;
-
-    const Activation _act;
-
-    const int _maxthreads;
-    int _nthreads;
-
-    /* Blocking info */
-    unsigned int _k_block=0;
-    unsigned int _x_block=0;
-
-    unsigned int _Mround_div=0;
-    unsigned int _Mround=0;
-    unsigned int _Nround_div=0;
-    unsigned int _Nround=0;
-
-    /* Working space, pretransposed buffer */
-    void *_working_space=nullptr;
-
-    /* We will need to walk through the blocks of B in a few contexts, so
-     * factor that out. */
-    class blockwalker {
-    private:
-        /* Size loops, etc. based on our parent's configuration */
-        const GemmInterleaved2d<strategy, To, Tr> &_parent;
-
-        /* K, X and multi parameters for the current iteration. */
-        unsigned int _k0=0, _x0=0, _xmin=0, _xmax=0, _multi=0;
-
-        unsigned int _index=0;
-        bool _done=false;
-        bool _newkblock=true;
-        bool _newmulti=true;
-
-    public:
-        blockwalker(const GemmInterleaved2d<strategy, To, Tr> &parent)
-        : _parent(parent)
-        , _xmax { parent._Nsize }
-        { }
-
-        blockwalker(const GemmInterleaved2d<strategy, To, Tr> &parent, unsigned int x0, unsigned int xmax)
-        : _parent(parent)
-        , _x0 { x0 }
-        , _xmin { x0 }
-        , _xmax { xmax }
-        {
-            assert(_x0 <= _xmax);
-        }
-
-        unsigned int xmax() {
-            return std::min(_x0 + _parent._x_block, _xmax);
-        }
-
-        unsigned int kmax() {
-            return std::min(_k0 + _parent._k_block, _parent._Ksize);
-        }
-
-        /* Advance to the next block, return false at the end. */
-        bool advance(void) {
-            if (_done) {
-                return false;
-            }
-
-            _newkblock=false;
-            _x0 += _parent._x_block;
-            if (_x0 >= _xmax) {
-                _x0=_xmin;
-                _k0 += _parent._k_block;
-                if (_k0 >= _parent._Ksize) {
-                    _k0=0;
-                    _multi++;
-                    if (_multi >= _parent._nmulti) {
-                        _done=true;
-                        return false;
-                    }
-                    _newmulti=true;
-                }
-                _newkblock=true;
-            }
-            _index++;
-
-            return true;
-        }
-
-        unsigned int k0(void)    { return _k0; }
-        unsigned int x0(void)    { return _x0; }
-        unsigned int multi(void) { return _multi; }
-        unsigned int index(void) { return _index; }
-        bool done(void)          { return _done; }
-        bool newkblock(void)     { return _newkblock; }
-    };
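The walker above is equivalent to a triple nested loop with x (columns of B) innermost. A minimal standalone illustration of the visit order, using made-up sizes rather than anything from the file:

    #include <algorithm>
    #include <cstdio>

    // Stand-in problem and block sizes, chosen only for illustration.
    const unsigned nmulti = 2, Ksize = 5, Nsize = 7, k_block = 2, x_block = 3;

    int main() {
        for (unsigned multi = 0; multi < nmulti; multi++) {
            for (unsigned k0 = 0; k0 < Ksize; k0 += k_block) {
                unsigned kmax = std::min(k0 + k_block, Ksize);
                for (unsigned x0 = 0; x0 < Nsize; x0 += x_block) {
                    unsigned xmax = std::min(x0 + x_block, Nsize);
                    // newkblock() is true on the first x block of each k block.
                    printf("multi=%u k=[%u,%u) x=[%u,%u)%s\n", multi, k0, kmax,
                           x0, xmax, (x0 == 0) ? "  <- newkblock" : "");
                }
            }
        }
    }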
-    // A working size: One of these needed, regardless of thread count. Divided according to window.
-    size_t get_a_working_size() const {
-        return ROUND_UP(sizeof(Toi) * _k_block * _Mround * _nbatches) * 2;
-    }
-
-    // B working size: 0, 1 or 3 of these needed depending on pretransposed and threading settings.
-    size_t get_b_working_size() const {
-        return ROUND_UP(sizeof(Toi) * _x_block * _k_block);
-    }
-
-    // C working size: One needed per thread.
-    size_t get_c_working_size() const {
-        return ROUND_UP(sizeof(Tri) * _x_block * strategy::out_height());
-    }
-
-    void execute_transpose(unsigned int m_start, unsigned int m_end, unsigned int n_start, unsigned int n_end, int threadid, int, int nthreadid) {
-        strategy strat(_ci);
-
-        /* Translate 'start' and 'end' into a position within the batches and rows. */
-        const unsigned int window_per_batch = _Mround / strategy::out_height();
-        unsigned int batch_0   = m_start / window_per_batch;
-        unsigned int batch_end = m_end / window_per_batch;
-
-        /* Compute the M values to operate on */
-        unsigned int m_0   = (m_start - (batch_0 * window_per_batch)) * strategy::out_height();
-        unsigned int m_max = (m_end - (batch_end * window_per_batch)) * strategy::out_height();
-
-        unsigned int n_0   = std::min(this->_Nsize, strategy::out_width() * n_start);
-        unsigned int n_max = std::min(this->_Nsize, strategy::out_width() * n_end);
-
-        blockwalker current(*this, n_0, n_max);
-
-        /* Get workspace as int8_t so the pointer arithmetic below is in bytes. */
-        assert(_working_space);
-        int8_t *working_space_bytes = reinterpret_cast<int8_t *>(_working_space);
-
-        auto c_panel_start = working_space_bytes;
-        auto a_panel_start = c_panel_start + get_c_working_size() * _maxthreads;
-        auto b_panel_start = a_panel_start + get_a_working_size() * _maxthreads;
-
-        auto c_panel = reinterpret_cast<Tri *>(c_panel_start + get_c_working_size() * threadid);
-        auto a_panel = reinterpret_cast<Toi *>(a_panel_start + get_a_working_size() * nthreadid);
-        auto b_panel = reinterpret_cast<Toi *>(b_panel_start + get_b_working_size() * threadid);
-
-        // newkblock() is always true on the first iteration, so kern_k will be set properly on the first loop.
-        int kern_k = 0;
-        for (; !current.done(); current.advance()) {
-            const int bblocks = iceildiv(current.xmax() - current.x0(), strategy::out_width());
-
-            /*
-             * The entirety of A^kblock is transposed up front and computed against individual
-             * blocks of B (xblock).
-             *
-             * Therefore, we only need to retranspose when k_block progresses.
-             */
-            if (current.newkblock()) {
-                for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
-                    unsigned int first_m = (batch == batch_0)   ? m_0   : 0;
-                    unsigned int last_m  = (batch == batch_end) ? m_max : _Msize;
-
-                    if (first_m >= last_m)
-                        continue;
-
-                    auto a_thread_panel_in = this->_Aptr
-                                           + (batch * this->_A_batch_stride)
-                                           + (current.multi() * this->_A_multi_stride);
-
-                    auto a_thread_panel_out = a_panel + ((batch * _Mround + first_m) * _k_block);
-
-                    strat.transforms.PrepareA(
-                        a_thread_panel_out,
-                        a_thread_panel_in,
-                        this->_lda,
-                        first_m,
-                        last_m,
-                        current.k0(),
-                        current.kmax(),
-                        _trA);
-                }
-
-                kern_k  = iceildiv(current.kmax() - current.k0(), strategy::k_unroll());
-                kern_k *= strat.k_unroll();
-            }
-
-            auto *b_panel_in = this->_Bptr + (current.multi() * this->_B_multi_stride);
-
-            strat.transforms.PrepareB(
-                b_panel,        // dst
-                b_panel_in,     // src
-                this->_ldb,
-                current.x0(),   // idx from
-                current.xmax(), // idx to
-                current.k0(),
-                current.kmax(),
-                _trB);
-
-            // Iterate over the batches
-            for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
-                unsigned int first_m = (batch == batch_0)   ? m_0   : 0;
-                unsigned int last_m  = (batch == batch_end) ? m_max : _Msize;
-
-                if (first_m >= last_m)
-                    continue;
-
-                const Toi *a_ptr = a_panel + (batch * _Mround + first_m) * _k_block;
-
-                // Iterate over the interleaved rows of the packed A matrix
-                for (unsigned int y=first_m; y<last_m; y+=strategy::out_height()) {
-                    unsigned int ymax = std::min(_Msize, y + strategy::out_height());
-
-                    strat.kernel(a_ptr, b_panel, c_panel, 1, bblocks, kern_k);
-                    a_ptr += (strategy::out_height() * kern_k);
-
-                    const bool first_pass = current.k0()   == 0;
-                    const bool last_pass  = current.kmax() == _Ksize;
-
-                    auto c_panel_out = this->_Cptr
-                                     + this->_C_batch_stride * batch
-                                     + this->_C_multi_stride * current.multi();
-
-                    auto bias = (first_pass && this->_bias)
-                              ? this->_bias + (current.multi() * this->_bias_multi_stride)
-                              : nullptr;
-
-                    auto act = last_pass ? _act : Activation();
-
-                    strat.transforms.Merge(
-                        c_panel_out,
-                        c_panel,
-                        this->_ldc,
-                        y,
-                        ymax,
-                        current.x0(),
-                        current.xmax(),
-                        bias,
-                        act,
-                        !first_pass); // Append
-                }
-            }
-        }
-    }
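The three *_panel_start pointers at the top of execute_transpose slice one flat allocation into C, A and B regions. A standalone sketch of that carving with made-up panel sizes (none of these figures come from the file); note that the C and B panels are selected by threadid, while the A panel is selected by nthreadid, the thread's position along N:

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    // Hypothetical, already cache-line-rounded panel sizes, standing in for
    // the get_c/a/b_working_size() results.
    const size_t c_sz = 4096, a_sz = 65536, b_sz = 16384;
    const int maxthreads = 4;

    alignas(64) int8_t working_space[(c_sz + a_sz + b_sz) * maxthreads];

    int main() {
        int8_t *c_panel_start = working_space;
        int8_t *a_panel_start = c_panel_start + c_sz * maxthreads;
        int8_t *b_panel_start = a_panel_start + a_sz * maxthreads;

        for (int t = 0; t < maxthreads; t++) {
            printf("thread %d: C at +%zu, B at +%zu\n", t,
                   (size_t)(c_panel_start + c_sz * t - working_space),
                   (size_t)(b_panel_start + b_sz * t - working_space));
        }
    }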
-public:
-    GemmInterleaved2d(GemmInterleaved2d &) = delete;
-    GemmInterleaved2d & operator= (GemmInterleaved2d &) = delete;
-
-    /* Constructor */
-    GemmInterleaved2d(const GemmArgs &args)
-    : _ci(args._ci)
-    , _Msize(args._Msize)
-    , _Nsize(args._Nsize)
-    , _Ksize(args._Ksize)
-    , _nbatches(args._nbatches)
-    , _nmulti(args._nmulti)
-    , _trA(args._trA)
-    , _trB(args._trB)
-    , _act(args._act)
-    , _maxthreads(args._maxthreads)
-    , _nthreads(args._maxthreads)
-
-    // Work out the rounded sizes of M and N - needed for some buffers.
-    , _Mround_div ( iceildiv(_Msize, strategy::out_height()) )
-    , _Mround     ( _Mround_div * strategy::out_height()     )
-
-    , _Nround_div ( iceildiv(_Nsize, strategy::out_width()) )
-    , _Nround     ( _Nround_div * strategy::out_width()     )
-    {
-        const unsigned int L1_size = _ci->get_L1_cache_size();
-        const unsigned int L2_size = _ci->get_L2_cache_size();
-
-        assert(_maxthreads > 0);
-
-        // Work out blocking parameters, or override from provided GemmConfig
-        if (args._cfg && args._cfg->inner_block_size) {
-            _k_block = args._cfg->inner_block_size;
-        } else {
-            // k_block: Find out how much of the larger array can be loaded into half the cache.
-            // This should account for associative caches.
-            _k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
-
-            // Needs to be (at least a single) multiple of the K unroll level.
-            _k_block /= strategy::k_unroll();
-            _k_block = std::max(_k_block, 1U) * strategy::k_unroll();
-
-            // Now tune to presented problem size; this is how many blocks we need.
-            unsigned int num_k_blocks = iceildiv(_Ksize, _k_block);
-
-            // So divide the space equally into that many blocks.
-            _k_block = iceildiv(_Ksize, num_k_blocks);
-
-            // And round UP to the K unroll level required.
-            _k_block = iceildiv(_k_block, strategy::k_unroll());
-            _k_block *= strategy::k_unroll();
-        }
-
-        if (args._cfg && args._cfg->outer_block_size) {
-            _x_block = args._cfg->outer_block_size;
-        } else {
-            // x_block: Work out how many rows (of length k_block) will fit in the L2.
-            // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
-            _x_block = (((L2_size * 9) / 10) - (_k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) /
-                       (sizeof(Toi) * _k_block);
-
-            // Needs to be (at least a single) multiple of the kernel output width.
-            _x_block /= strategy::out_width();
-            _x_block = std::max(_x_block, 1U) * strategy::out_width();
-
-            // And tune to the presented problem size.
-            unsigned int num_x_blocks = iceildiv(_Nsize, _x_block);
-            _x_block = iceildiv(_Nsize, num_x_blocks);
-
-            _x_block = iceildiv(_x_block, strategy::out_width());
-            _x_block *= strategy::out_width();
-        }
-    }
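To make the k_block heuristic concrete, here is a hypothetical evaluation with invented parameters (32 KiB L1, a 4-byte operand type, out_width 12, out_height 8, k_unroll 1, K = 1000 - none of these values are taken from the file):

    #include <algorithm>
    #include <cstdio>

    int main() {
        unsigned L1_size = 32768, Ksize = 1000;
        unsigned out_width = 12, out_height = 8, k_unroll = 1, sizeof_Toi = 4;

        // Half the L1, divided by the bytes one K step of the wider dimension needs.
        unsigned k_block = (L1_size / 2) / (sizeof_Toi * std::max(out_width, out_height)); // 341
        k_block = std::max(k_block / k_unroll, 1U) * k_unroll;                             // 341

        // Rebalance so the K blocks come out evenly sized, then round up to the unroll.
        unsigned num_k_blocks = (Ksize + k_block - 1) / k_block;    // 3
        k_block = (Ksize + num_k_blocks - 1) / num_k_blocks;        // 334
        k_block = ((k_block + k_unroll - 1) / k_unroll) * k_unroll; // 334

        printf("k_block = %u across %u blocks\n", k_block, num_k_blocks);
    }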
-
-    // Interface implementation - Compulsory functions
-    ndrange_t get_window_size() const override {
-        unsigned m = (_Mround / strategy::out_height()) * _nbatches;
-        unsigned n = _Nround_div;
-
-        return { m, n };
-    }
-
-    // set_nthreads: pass on to buffer manager to avoid it waiting for non-existent threads.
-    void set_nthreads(int nthreads) override {
-        _nthreads = std::min(nthreads, _maxthreads);
-    }
-
-    void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override {
-        /*
-         * This particular GEMM implementation can only be broken up over the
-         * M & N dimensions; we inform the framework of this limitation via
-         * the get_window_size function.
-         */
-        const auto m_start = work_range.get_position(0);
-        const auto n_start = work_range.get_position(1);
-        const auto m_size  = work_range.get_size(0);
-        const auto n_size  = work_range.get_size(1);
-        const auto m_end   = m_start + m_size;
-        const auto n_end   = n_start + n_size;
-
-        const auto m_threadid = thread_locator.get_position(0);
-        const auto n_threadid = thread_locator.get_position(1);
-
-        execute_transpose(m_start, m_end, n_start, n_end, threadid, m_threadid, n_threadid);
-    }
-
-    std::size_t get_working_size() const override {
-        /*
-         * Because we do not know how the scheduler will break up the task, we
-         * need to allocate enough space to handle both the case where every
-         * thread is parallelised across B AND the case where every thread is
-         * parallelised across A.
-         *
-         * If we parallelise across A, then we only need one buffer of A and 64 buffers of B.
-         * If we parallelise across B, then we need 64 buffers of A and 64 buffers of B.
-         */
-        return get_c_working_size() * _maxthreads
-             + get_a_working_size() * _maxthreads
-             + get_b_working_size() * _maxthreads
-             + 64; // to account for cache-line alignment
-    }
-
-    void set_working_space(void *working_space) override {
-        // Make sure everything ends up cache line aligned
-        int8_t  *working_space_bytes = reinterpret_cast<int8_t *>(working_space);
-        intptr_t working_space_int   = reinterpret_cast<intptr_t>(working_space);
-
-        size_t diff = 0;
-
-        if (working_space_int & 0x3F) {
-            diff = 0x40 - (working_space_int & 0x3F);
-        }
-
-        working_space_bytes += diff;
-
-        _working_space = reinterpret_cast<void *>(working_space_bytes);
-    }
-
-    ~GemmInterleaved2d() override { }
-};
-
-} // namespace arm_gemm
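Finally, the alignment fix-up in set_working_space is why get_working_size() reserves 64 slack bytes: the caller's pointer is rounded up to the next 64-byte boundary before use. The same trick in isolation (a standalone sketch, not from the file):

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    // Round an arbitrary pointer up to the next 64-byte cache-line boundary.
    void *align_to_cacheline(void *p) {
        intptr_t v = reinterpret_cast<intptr_t>(p);
        size_t diff = 0;
        if (v & 0x3F) {
            diff = 0x40 - (v & 0x3F); // bytes to the next boundary, at most 63
        }
        return reinterpret_cast<int8_t *>(p) + diff;
    }

    int main() {
        static char buf[128];
        void *aligned = align_to_cacheline(buf + 1); // deliberately misaligned input
        printf("%p -> %p\n", static_cast<void *>(buf + 1), aligned);
    }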