author | Georgios Pinitas <georgios.pinitas@arm.com> | 2020-07-06 19:10:38 +0100 |
---|---|---|
committer | Georgios Pinitas <georgios.pinitas@arm.com> | 2020-07-07 15:07:37 +0000 |
commit | 0cc50ed757f06f4f076e261cb7253dd67264dec6 (patch) | |
tree | 866306f780f7529ea172d78210355aed761ad7a4 /src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp | |
parent | ab23dd0fbc632063235a6ad408241dc79a35d3e4 (diff) | |
download | ComputeLibrary-0cc50ed757f06f4f076e261cb7253dd67264dec6.tar.gz |
COMPMID-3324: Remove pretransposed support from NEON backend
Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Change-Id: I394c6c539969940e0119cbc14174909d47e65de6
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3519
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp')
-rw-r--r-- | src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp | 281 |
1 file changed, 92 insertions, 189 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
index f572f7940b..3b829491ca 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
@@ -31,7 +31,6 @@
 #include "arm_gemm.hpp"
 #include "utils.hpp"
 
-#include "buffer_manager.hpp"
 #include "mergeresults.hpp"
 #include "transform.hpp"
 
@@ -65,14 +64,10 @@ class GemmInterleaved : public GemmCommon<To, Tr> {
     const unsigned int _nbatches;
     const unsigned int _nmulti;
 
-    const bool _trA;
-    const bool _trB;
-
     const Activation _act;
 
     const int _maxthreads;
     int _nthreads;
-    const bool _pretransposed;
 
     /* Blocking info */
     unsigned int _k_block=0;
@@ -81,7 +76,6 @@ class GemmInterleaved : public GemmCommon<To, Tr> {
 
     /* Working space, pretransposed buffer, buffer manager */
     const Toi *_B_transposed=nullptr;
-    BufferManager *_bm=nullptr;
     void *_working_space=nullptr;
 
     /* We will need to walk through the blocks of B in a few contexts, so
@@ -150,27 +144,100 @@ class GemmInterleaved : public GemmCommon<To, Tr> {
         return ROUND_UP(sizeof(Toi) * _k_block * _Mround * _nbatches);
     }
 
-    // B working size: 0, 1 or 3 of these needed depending on pretransposed and threading settings.
-    size_t get_b_working_size() const {
-        return ROUND_UP(sizeof(Toi) * _x_block * _k_block);
-    }
-
     // C working size: One needed per thread.
     size_t get_c_working_size() const {
         return ROUND_UP(sizeof(Tri) * _x_block * strategy::out_height());
     }
 
-    // Internal execute function.
-    // This supports both the "pretransposed" and "standard" interfaces via the template parameter.
-    template<bool pretransposed>
-    void execute_internal(unsigned int start, unsigned int end, int threadid) {
+
+public:
+    GemmInterleaved(GemmInterleaved &) = delete;
+    GemmInterleaved & operator= (GemmInterleaved &) = delete;
+
+    /* Constructor */
+    GemmInterleaved(const GemmArgs &args)
+                    : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize),
+                      _nbatches(args._nbatches), _nmulti(args._nmulti),
+                      _act(args._act), _maxthreads(args._maxthreads), _nthreads(args._maxthreads) {
+        const unsigned int L1_size = _ci->get_L1_cache_size();
+        const unsigned int L2_size = _ci->get_L2_cache_size();
+
+        assert(_maxthreads > 0);
+
+        // Work out blocking parameters, or override from provided GemmConfig
+        if (args._cfg && args._cfg->inner_block_size) {
+            _k_block = args._cfg->inner_block_size;
+        } else {
+            // k_block: Find out how much of the larger array can be loaded into half the cache.
+            // This should account for associative caches.
+            _k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
+
+            // Needs to be (at least a single) multiple of the K unroll level.
+            _k_block /= strategy::k_unroll();
+            _k_block = std::max(_k_block, 1U) * strategy::k_unroll();
+
+            // Now tune to presented problem size; this is how many blocks we need.
+            unsigned int num_k_blocks = iceildiv(_Ksize, _k_block);
+
+            // So divide the space equally into that many blocks.
+            _k_block = iceildiv(_Ksize, num_k_blocks);
+
+            // And round UP to the K unroll level required.
+            _k_block = iceildiv(_k_block, strategy::k_unroll());
+            _k_block *= strategy::k_unroll();
+        }
+
+        if (args._cfg && args._cfg->outer_block_size) {
+            _x_block = args._cfg->outer_block_size;
+        } else {
+            // x_block: Work out how many rows (of length k_block) will fit in the L2
+            // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
+            _x_block = (((L2_size * 9) / 10) - (_k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) /
+                       (sizeof(Toi) * _k_block);
+
+            // Needs to be (at least a single) multiple of the kernel output width.
+            _x_block /= strategy::out_width();
+            _x_block = std::max(_x_block, 1U) * strategy::out_width();
+
+            // And tune to the presented problem size.
+            unsigned int num_x_blocks = iceildiv(_Nsize, _x_block);
+            _x_block = iceildiv(_Nsize, num_x_blocks);
+
+            _x_block = iceildiv(_x_block, strategy::out_width());
+            _x_block *= strategy::out_width();
+        }
+
+        // Work out the rounded size of M - needed for some buffers.
+        _Mround = iceildiv(_Msize, strategy::out_height());
+        _Mround *= strategy::out_height();
+    }
+
+    // Interface implementation - Compulsory functions
+
+    // Window size: Only the last thread should do a ragged block, so dole
+    // out work in units of out_height. Factor batches into the window, but
+    // not multi for now (as this would cause problems with the buffer
+    // manager).
+    ndrange_t get_window_size() const override {
+        // _Mround is a multiple of out_height by definition.
+        return { (_Mround / strategy::out_height()) * _nbatches };
+    }
+
+    // set_nthreads: pass on to buffer manager to avoid it waiting for non-existant threads.
+    void set_nthreads(int nthreads) override {
+        _nthreads = std::min(nthreads, _maxthreads);
+    }
+
+    // Execute
+    void execute(const ndcoord_t &work_range, const ndcoord_t &, int threadid) override {
+        const auto start = work_range.get_position(0);
+        const auto end = work_range.get_position_end(0);
 #ifdef CYCLE_PROFILING
         profiler prof;
 #endif
         strategy strat(_ci);
 
         blockwalker current(*this);
-        blockwalker next=current;
 
         /* Translate 'start' and 'end' into a position within the batches and rows. */
         const unsigned int window_per_batch = _Mround / strategy::out_height();
@@ -182,12 +249,7 @@ class GemmInterleaved : public GemmCommon<To, Tr> {
         unsigned int m_max = (end - (batch_end * window_per_batch)) * strategy::out_height();
 
         /* Make sure we've been set up correctly. */
-        if (pretransposed) {
-            assert(_B_transposed);
-        } else {
-            assert(_bm);
-        }
-
+        assert(_B_transposed);
         assert(_working_space);
         int8_t *working_space_bytes = reinterpret_cast<int8_t *>(_working_space);
 
@@ -198,12 +260,8 @@ class GemmInterleaved : public GemmCommon<To, Tr> {
         Toi * const a_panel = reinterpret_cast<Toi *>(working_space_bytes + (_maxthreads * get_c_working_size()));
         Tri * const c_panel = reinterpret_cast<Tri *>(working_space_bytes + (threadid * get_c_working_size()));
 
-        // Shared buffers - these come either from BufferManager or _B_transposed.
         const Toi *b_panel;
-
-        if (pretransposed) {
-            b_panel = _B_transposed;
-        }
+        b_panel = _B_transposed;
 
         //printf("Starting GEMM loop, x_block=%d, k_block=%d\n", _x_block, _k_block);
 
@@ -224,7 +282,7 @@ class GemmInterleaved : public GemmCommon<To, Tr> {
 
                     strat.transforms.PrepareA(a_panel + ((batch * _Mround + first_m) * _k_block),
                                               this->_Aptr + (batch * this->_A_batch_stride) + (current.multi() * this->_A_multi_stride),
-                                              this->_lda, first_m, last_m, current.k0(), current.kmax(), _trA);
+                                              this->_lda, first_m, last_m, current.k0(), current.kmax());
                 }
 
                 // Figure out how many "K" the kernel will actually process.
@@ -234,41 +292,6 @@ class GemmInterleaved : public GemmCommon<To, Tr> {
 
             int bblocks = iceildiv(current.xmax() - current.x0(), strategy::out_width());
 
-            if (!pretransposed) {
-                /* Look ahead to the next block and populate it if necessary.
-                 * This avoids the populate operation becoming a bottleneck, and
-                 * helps keep the threads synchronized (the first thread to get
-                 * here will populate while the rest will advance).
-                 *
-                 * If we are running single threaded, bm->try_populate() will do
-                 * nothing.
-                 */
-                if (next.advance()) {
-                    _bm->try_populate(next.index(), [&](void *buffer) {
-#ifdef CYCLE_PROFILING
-                        auto p=prof.ScopedProfiler(PROFILE_PREPB, (next.xmax()-next.x0()) * (next.kmax()-next.k0()) * sizeof(Toi));
-#endif
-
-                        Toi *b_panel = reinterpret_cast<Toi *>(buffer);
-
-                        strat.transforms.PrepareB(b_panel, this->_Bptr + (next.multi() * this->_B_multi_stride), this->_ldb,
-                                                  next.x0(), next.xmax(), next.k0(), next.kmax(), _trB);
-                    });
-                }
-
-                /* Get the buffer for this iteration from the BufferManager. */
-                b_panel = reinterpret_cast<Toi *>(_bm->get(current.index(), [&](void *bpv) {
-#ifdef CYCLE_PROFILING
-                    auto p=prof.ScopedProfiler(PROFILE_PREPB, (current.xmax()-current.x0()) * (current.kmax()-current.k0()) * sizeof(Toi));
-#endif
-
-                    Toi *b_panel = reinterpret_cast<Toi *>(bpv);
-
-                    strat.transforms.PrepareB(b_panel, this->_Bptr + (current.multi() * this->_B_multi_stride), this->_ldb,
-                                              current.x0(), current.xmax(), current.k0(), current.kmax(), _trB);
-                }));
-            }
-
             /* Do the actual work. */
             for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
                 unsigned int first_m = (batch == batch_0) ? m_0 : 0;
@@ -308,105 +331,7 @@ class GemmInterleaved : public GemmCommon<To, Tr> {
                 }
             }
 
-            if (pretransposed) {
-                b_panel += (bblocks * strat.out_width() * kern_k);
-            } else {
-                _bm->release(current.index());
-            }
-        }
-    }
-
-public:
-    GemmInterleaved(GemmInterleaved &) = delete;
-    GemmInterleaved & operator= (GemmInterleaved &) = delete;
-
-    /* Constructor */
-    GemmInterleaved(const GemmArgs &args)
-                    : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize),
-                      _nbatches(args._nbatches), _nmulti(args._nmulti), _trA(args._trA), _trB(args._trB),
-                      _act(args._act), _maxthreads(args._maxthreads), _nthreads(args._maxthreads),
-                      _pretransposed(args._pretransposed_hint) {
-        const unsigned int L1_size = _ci->get_L1_cache_size();
-        const unsigned int L2_size = _ci->get_L2_cache_size();
-
-        assert(_maxthreads > 0);
-
-        // Work out blocking parameters, or override from provided GemmConfig
-        if (args._cfg && args._cfg->inner_block_size) {
-            _k_block = args._cfg->inner_block_size;
-        } else {
-            // k_block: Find out how much of the larger array can be loaded into half the cache.
-            // This should account for associative caches.
-            _k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
-
-            // Needs to be (at least a single) multiple of the K unroll level.
-            _k_block /= strategy::k_unroll();
-            _k_block = std::max(_k_block, 1U) * strategy::k_unroll();
-
-            // Now tune to presented problem size; this is how many blocks we need.
-            unsigned int num_k_blocks = iceildiv(_Ksize, _k_block);
-
-            // So divide the space equally into that many blocks.
-            _k_block = iceildiv(_Ksize, num_k_blocks);
-
-            // And round UP to the K unroll level required.
-            _k_block = iceildiv(_k_block, strategy::k_unroll());
-            _k_block *= strategy::k_unroll();
-        }
-
-        if (args._cfg && args._cfg->outer_block_size) {
-            _x_block = args._cfg->outer_block_size;
-        } else {
-            // x_block: Work out how many rows (of length k_block) will fit in the L2
-            // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
-            _x_block = (((L2_size * 9) / 10) - (_k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) /
-                       (sizeof(Toi) * _k_block);
-
-            // Needs to be (at least a single) multiple of the kernel output width.
-            _x_block /= strategy::out_width();
-            _x_block = std::max(_x_block, 1U) * strategy::out_width();
-
-            // And tune to the presented problem size.
-            unsigned int num_x_blocks = iceildiv(_Nsize, _x_block);
-            _x_block = iceildiv(_Nsize, num_x_blocks);
-
-            _x_block = iceildiv(_x_block, strategy::out_width());
-            _x_block *= strategy::out_width();
-        }
-
-        // Work out the rounded size of M - needed for some buffers.
-        _Mround = iceildiv(_Msize, strategy::out_height());
-        _Mround *= strategy::out_height();
-    }
-
-    // Interface implementation - Compulsory functions
-
-    // Window size: Only the last thread should do a ragged block, so dole
-    // out work in units of out_height. Factor batches into the window, but
-    // not multi for now (as this would cause problems with the buffer
-    // manager).
-    ndrange_t get_window_size() const override {
-        // _Mround is a multiple of out_height by definition.
-        return { (_Mround / strategy::out_height()) * _nbatches };
-    }
-
-    // set_nthreads: pass on to buffer manager to avoid it waiting for non-existant threads.
-    void set_nthreads(int nthreads) override {
-        _nthreads = std::min(nthreads, _maxthreads);
-        if (_bm) {
-            _bm->set_nthreads(_nthreads);
-        }
-    }
-
-    // Execute
-    void execute(const ndcoord_t &work_range, const ndcoord_t &, int threadid) override {
-        const auto start = work_range.get_position(0);
-        const auto end = work_range.get_position_end(0);
-
-        if (_pretransposed) {
-            execute_internal<true>(start, end, threadid);
-        } else {
-            execute_internal<false>(start, end, threadid);
+            b_panel += (bblocks * strat.out_width() * kern_k);
         }
     }
 
@@ -415,12 +340,6 @@ public:
         // In all cases, we need one A buffer plus a C buffer per thread.
         size_t size = get_a_working_size() + (get_c_working_size() * _maxthreads);
 
-        // For pretransposed case, there is no working space needed for B.
-        // Otherwise, we need a BufferManager.
-        if (!_pretransposed) {
-            size += BufferManager::get_storage_requirement(_maxthreads, get_b_working_size());
-        }
-
         size += 64; // Add on a cache line extra for alignment.
 
         return size;
@@ -439,29 +358,17 @@ public:
 
         working_space_bytes += diff;
 
-        if (_pretransposed) {
-            // Pretransposed case: just set internal pointer to parameter value.
-            _working_space = reinterpret_cast<void *>(working_space_bytes);
-        } else {
-            // Otherwise, use the first part of the working space for the buffer manager.
-            // It's legal to call this again so don't leak a buffer manager if it already existed.
-            delete _bm;
-
-            _bm = new BufferManager(_nthreads, get_b_working_size(), reinterpret_cast<void *>(working_space_bytes));
-
-            working_space_bytes += BufferManager::get_storage_requirement(_maxthreads, get_b_working_size());
-
-            _working_space = reinterpret_cast<void *>(working_space_bytes);
-        }
+        // Pretransposed case: just set internal pointer to parameter value.
+        _working_space = reinterpret_cast<void *>(working_space_bytes);
     }
 
     // Interface implementation - pretransposed
     bool B_is_pretransposed() const override {
-        return _pretransposed;
+        return true;
    }
 
     bool B_pretranspose_required() const override {
-        return _pretransposed && (_B_transposed==nullptr);
+        return (_B_transposed==nullptr);
    }
 
     // TODO: this could almost certainly be considerably simpler.
@@ -506,7 +413,7 @@ public:
             k_size *= strategy::k_unroll();
 
             strat.transforms.PrepareB(buffer, B + (current.multi() * B_multi_stride), ldb,
-                                      current.x0(), current.xmax(), current.k0(), current.kmax(), _trB);
+                                      current.x0(), current.xmax(), current.k0(), current.kmax());
 
             buffer += (x_size * k_size);
         } while (current.advance());
@@ -515,10 +422,6 @@ public:
     void set_pretransposed_B_data(void *in_buffer) override {
        _B_transposed = reinterpret_cast<Toi *>(in_buffer);
    }
-
-    ~GemmInterleaved() override {
-        delete _bm;
-    }
 };
 
 } // namespace arm_gemm
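For readers unfamiliar with the cache-blocking heuristic that the rewritten constructor now owns outright, the following is a minimal standalone sketch of that arithmetic. The cache sizes (32 KB L1, 512 KB L2), the 12x8 FP32 kernel geometry, the element size, and the 1000x1000x1000 problem are assumed example values, not taken from any particular CPU or strategy in the library, and `iceildiv` is re-implemented locally; none of this is part of the patch itself.

```cpp
// Sketch of the k_block / x_block / Mround computation from the constructor above,
// with assumed hardware and kernel parameters.
#include <algorithm>
#include <cstdio>

// Round-up integer division, mirroring arm_gemm's iceildiv().
static unsigned int iceildiv(unsigned int a, unsigned int b) {
    return (a + b - 1) / b;
}

int main() {
    // Assumed hardware/kernel parameters (not from the library).
    const unsigned int L1_size    = 32 * 1024;   // bytes
    const unsigned int L2_size    = 512 * 1024;  // bytes
    const unsigned int out_width  = 12;          // kernel output tile width
    const unsigned int out_height = 8;           // kernel output tile height
    const unsigned int k_unroll   = 1;           // K unroll of the kernel
    const unsigned int elem_size  = 4;           // sizeof(Toi) for FP32

    // Assumed problem size.
    const unsigned int Msize = 1000, Nsize = 1000, Ksize = 1000;

    // k_block: how much of the larger array fits in half the L1.
    unsigned int k_block = (L1_size / 2) / (elem_size * std::max(out_width, out_height));
    k_block /= k_unroll;
    k_block = std::max(k_block, 1U) * k_unroll;

    // Re-balance against the actual K so the blocks come out evenly sized.
    unsigned int num_k_blocks = iceildiv(Ksize, k_block);
    k_block = iceildiv(Ksize, num_k_blocks);
    k_block = iceildiv(k_block, k_unroll) * k_unroll;

    // x_block: how many columns (of length k_block) fit in 90% of the L2,
    // after subtracting the A and C tiles that share it.
    unsigned int x_block = (((L2_size * 9) / 10) - (k_block * elem_size * (out_width + out_height))) /
                           (elem_size * k_block);
    x_block /= out_width;
    x_block = std::max(x_block, 1U) * out_width;

    unsigned int num_x_blocks = iceildiv(Nsize, x_block);
    x_block = iceildiv(Nsize, num_x_blocks);
    x_block = iceildiv(x_block, out_width) * out_width;

    // Mround: M padded up to a whole number of output tiles.
    unsigned int Mround = iceildiv(Msize, out_height) * out_height;

    // With the assumed numbers above this prints k_block=334, x_block=252, Mround=1000.
    std::printf("k_block=%u x_block=%u Mround=%u\n", k_block, x_block, Mround);
    return 0;
}
```

Because the K and N blocks are re-balanced against the actual problem size, the final `k_block` and `x_block` usually come out a little smaller than the raw cache-derived limits (334 versus 341 and 252 versus 324 here), which keeps each iteration's working set comfortably inside the L1/L2 budgets.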