Diffstat (limited to 'src')
-rw-r--r-- | src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp | 24
-rw-r--r-- | src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp | 20
-rw-r--r-- | src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp | 20
-rw-r--r-- | src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp | 20
-rw-r--r-- | src/core/NEON/kernels/arm_gemm/gemm_interleaved_2d.hpp | 449
-rw-r--r-- | src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp | 514
-rw-r--r-- | src/core/NEON/kernels/arm_gemm/gemm_native.hpp | 18
-rw-r--r-- | src/core/NEON/kernels/arm_gemm/gemv_batched.hpp | 8
-rw-r--r-- | src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp | 19
-rw-r--r-- | src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp | 19
-rw-r--r-- | src/core/NEON/kernels/arm_gemm/ndrange.hpp | 85
-rw-r--r-- | src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp | 9
-rw-r--r-- | src/runtime/CPP/CPPScheduler.cpp | 168
-rw-r--r-- | src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp | 15
14 files changed, 1309 insertions, 79 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp index 96e3ce832c..e3355ed2d5 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp @@ -26,6 +26,8 @@ #include "gemm_hybrid.hpp" #include "gemm_implementation.hpp" #include "gemm_interleaved.hpp" +#include "gemm_interleaved_2d.hpp" +#include "gemm_interleaved_pretransposed_2d.hpp" #include "gemm_native.hpp" #include "gemv_batched.hpp" #include "gemv_native_transposed.hpp" @@ -144,13 +146,31 @@ static const GemmImplementation<float, float> gemm_fp32_methods[] = [](const GemmArgs &args) { return new GemmInterleaved<interleaved_fp32_mla_3VLx8, float, float>(args); } }, #endif // __ARM_FEATURE_SVE +//Pretranpose, 2D split +{ + GemmMethod::GEMM_INTERLEAVED_2D, + "sgemm_12x8", + [](const GemmArgs &args) { return args._pretransposed_hint; }, + [](const GemmArgs &args) { return args._pretransposed_hint; }, + [](const GemmArgs &args) { return new GemmInterleavedPretransposed2d<sgemm_12x8, float, float>(args); } +}, +//Tranpose, 2D split, no blockmanager +{ + GemmMethod::GEMM_INTERLEAVED_2D, + "sgemm_12x8", + [](const GemmArgs &args) { return (!args._pretransposed_hint) && args._maxthreads >= 8; }, + [](const GemmArgs &args) { return (!args._pretransposed_hint) && args._maxthreads >= 8; }, + [](const GemmArgs &args) { return new GemmInterleaved2d<sgemm_12x8, float, float>(args); } +}, +//Tranpose, 1D split, with blockmanager { GemmMethod::GEMM_INTERLEAVED, "sgemm_12x8", - nullptr, - nullptr, + [](const GemmArgs &args) { return (!args._pretransposed_hint); }, + [](const GemmArgs &args) { return (!args._pretransposed_hint); }, [](const GemmArgs &args) { return new GemmInterleaved<sgemm_12x8, float, float>(args); } }, + #endif // __aarch64__ #ifdef __arm__ diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp index c3abb04db7..0cb3160de4 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -142,8 +142,8 @@ public: _window_range(iceildiv(args._Msize, strategy::out_height()), _nbatches, iceildiv(_Nsize, _n_block), _nmulti) { } // Interface implementation - Compulsory functions - unsigned int get_window_size() const override { - return _window_range.total_size(); + ndrange_t get_window_size() const override { + return { _window_range.total_size(), 1u, 1u, 1u, 1u, 1u }; } // This kernel can always be dynamically scheduled. 
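The gemm_fp32.cpp hunk above registers the two new GEMM_INTERLEAVED_2D variants ahead of the existing 1D GEMM_INTERLEAVED entry, gating the pretransposed variant on args._pretransposed_hint and the transposing variant on having at least 8 threads. Below is a minimal sketch of how such a predicate-based method table can be walked to pick the first matching entry. The Entry struct, the pick() helper and the variant names are simplified stand-ins for illustration only, not the library's actual GemmImplementation machinery.

#include <cstdio>
#include <functional>

// Hypothetical, simplified stand-ins for the real arm_gemm selection types.
struct Args { bool pretransposed_hint; int maxthreads; };
enum class Method { INTERLEAVED_2D, INTERLEAVED };

struct Entry {
    Method method;
    const char *name;
    std::function<bool(const Args&)> is_supported;   // hard requirement
    std::function<bool(const Args&)> is_recommended; // soft preference
};

// Mirrors the ordering added above: pretransposed 2D first, then the
// transposing 2D variant (only when >= 8 threads), then the 1D fallback.
static const Entry table[] = {
    { Method::INTERLEAVED_2D, "sgemm_12x8 (pretransposed, 2D)",
      [](const Args &a) { return a.pretransposed_hint; },
      [](const Args &a) { return a.pretransposed_hint; } },
    { Method::INTERLEAVED_2D, "sgemm_12x8 (transposing, 2D)",
      [](const Args &a) { return !a.pretransposed_hint && a.maxthreads >= 8; },
      [](const Args &a) { return !a.pretransposed_hint && a.maxthreads >= 8; } },
    { Method::INTERLEAVED, "sgemm_12x8 (1D)",
      [](const Args &a) { return !a.pretransposed_hint; },
      [](const Args &a) { return !a.pretransposed_hint; } },
};

const Entry *pick(const Args &args) {
    for (const auto &e : table) {
        if (e.is_supported(args) && e.is_recommended(args)) {
            return &e;
        }
    }
    return nullptr;
}

int main() {
    const Entry *e = pick({ /*pretransposed_hint=*/false, /*maxthreads=*/4 });
    std::printf("selected: %s\n", e ? e->name : "none");
}

With 4 threads and no pretranspose hint, neither 2D entry matches and the walk falls through to the original 1D kernel, which is the intent of the ordering above.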
@@ -151,8 +151,7 @@ public: return true; } - // Execute - void execute(unsigned int start, unsigned int end, int threadid) override { + void execute_1d(unsigned int start, unsigned int end, int threadid) { UNUSED(threadid); #ifdef CYCLE_PROFILING profiler prof; @@ -215,6 +214,17 @@ public: } } + // Execute + void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override { + UNUSED(thread_locator); + + const auto start = work_range.get_position(0); + const auto size = work_range.get_size(0); + const auto stop = start + size; + + execute_1d(start, stop, threadid); + } + // Interface implementation - pretransposed bool B_is_pretransposed() const override { return true; diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp index 22b6960baf..3d7ad99d1e 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -149,8 +149,8 @@ public: _qp (qp), _nthreads(args._maxthreads) { } // Interface implementation - Compulsory functions - unsigned int get_window_size() const override { - return _window_range.total_size(); + ndrange_t get_window_size() const override { + return { _window_range.total_size(), 1u, 1u, 1u, 1u, 1u }; } // This kernel can always be dynamically scheduled. @@ -158,8 +158,7 @@ public: return true; } - // Execute - void execute(unsigned int start, unsigned int end, int threadid) override { + void execute_1d(unsigned int start, unsigned int end, int threadid) { #ifdef CYCLE_PROFILING profiler prof; #endif @@ -234,6 +233,17 @@ public: } } + // Execute + void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override { + UNUSED(thread_locator); + + const auto start = work_range.get_position(0); + const auto size = work_range.get_size(0); + const auto stop = start + size; + + execute_1d(start, stop, threadid); + } + // Working space needed for intermediate result buffers. size_t get_working_size() const override { return (_nthreads * strategy::out_height() * _Nsize * sizeof(Tri)); diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp index efd984561d..4897bedf47 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -385,9 +385,9 @@ public: // out work in units of out_height. Factor batches into the window, but // not multi for now (as this would cause problems with the buffer // manager). - unsigned int get_window_size() const override { - // _Mround is a multiple of out_height by definition. - return (_Mround / strategy::out_height()) * _nbatches; + ndrange_t get_window_size() const override { + auto m_win_size = (_Mround / strategy::out_height()) * _nbatches; + return { m_win_size, 1u, 1u, 1u, 1u, 1u }; } // set_nthreads: pass on to buffer manager to avoid it waiting for non-existant threads. 
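Each of the 1D kernels in the hunks above keeps its original loop as execute_1d() and gains a thin execute() override that unpacks dimension 0 of the new ndcoord_t work range. A self-contained sketch of that adapter pattern follows; the Coord type is a simplified stand-in for the library's NDCoordinate, included only so the example compiles on its own.

#include <cstdio>
#include <array>
#include <utility>

// Minimal stand-in for ndcoord_t: a (position, size) pair per dimension.
struct Coord {
    std::array<std::pair<unsigned, unsigned>, 6> d {};
    unsigned get_position(unsigned i) const { return d[i].first; }
    unsigned get_size(unsigned i)     const { return d[i].second; }
    unsigned get_position_end(unsigned i) const { return d[i].first + d[i].second; }
};

struct Kernel1D {
    // Original 1D entry point: the work loop is untouched.
    void execute_1d(unsigned start, unsigned end, int threadid) {
        std::printf("thread %d: rows [%u, %u)\n", threadid, start, end);
    }

    // New ND entry point: only dimension 0 carries work for a 1D kernel,
    // so translate its (position, size) back into the old [start, end) pair.
    void execute(const Coord &work_range, const Coord &/*thread_locator*/, int threadid) {
        const auto start = work_range.get_position(0);
        const auto stop  = work_range.get_position_end(0);
        execute_1d(start, stop, threadid);
    }
};

int main() {
    Kernel1D k;
    Coord range;
    range.d[0] = { 8, 24 };       // start at row-block 8, process 24 blocks
    k.execute(range, Coord{}, 0); // prints: thread 0: rows [8, 32)
}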
@@ -399,7 +399,7 @@ public: } // Execute - void execute(unsigned int start, unsigned int end, int threadid) override { + void execute_1d(unsigned int start, unsigned int end, int threadid) { if (_pretransposed) { execute_internal<true>(start, end, threadid); } else { @@ -407,6 +407,16 @@ public: } } + //Execute + void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override { + UNUSED(thread_locator); + + const auto start = work_range.get_position(0); + const auto stop = work_range.get_position_end(0); + + execute_1d(start, stop, threadid); + } + // Interface implementation - working space size_t get_working_size() const override { // In all cases, we need one A buffer plus a C buffer per thread. diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved_2d.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved_2d.hpp new file mode 100644 index 0000000000..53f8e6c938 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved_2d.hpp @@ -0,0 +1,449 @@ +/* + * Copyright (c) 2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#include "arm_gemm.hpp" +#include "utils.hpp" + +#include "mergeresults.hpp" +#include "transform.hpp" + +#ifdef CYCLE_PROFILING +#include "profiler.hpp" +#endif + +#include <algorithm> +#include <cassert> + +// Some macros used to decide how much working space to allocate. +// Round allocations up to the next cache line. +#define ALLOC_ROUND 64 +#define ROUND_UP(x) ((((x) + ALLOC_ROUND-1) / ALLOC_ROUND) * ALLOC_ROUND) + +// Implementation of the GemmCommon abstract class. +// +// This implementation interleaves the source matrices in blocks - good for +// larger matrices. 
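The ALLOC_ROUND / ROUND_UP macros above round every working-space allocation up to a whole 64-byte cache line so that per-thread buffers never share a line. A quick check of the arithmetic, assuming the 64-byte line size used in the file:

#define ALLOC_ROUND 64
#define ROUND_UP(x) ((((x) + ALLOC_ROUND-1) / ALLOC_ROUND) * ALLOC_ROUND)

static_assert(ROUND_UP(1)   == 64,  "anything non-zero occupies a full line");
static_assert(ROUND_UP(64)  == 64,  "an exact multiple is left alone");
static_assert(ROUND_UP(65)  == 128, "one byte over spills into the next line");
static_assert(ROUND_UP(150) == 192, "150 bytes -> 3 cache lines");

int main() { return 0; }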
+namespace arm_gemm { + +template<typename strategy, typename To, typename Tr> +class GemmInterleaved2d : public GemmCommon<To, Tr> { + typedef typename strategy::operand_type Toi; + typedef typename strategy::result_type Tri; + + /* const properties set by constructor */ + const CPUInfo * const _ci; + + const unsigned int _Msize; + const unsigned int _Nsize; + const unsigned int _Ksize; + + const unsigned int _nbatches; + const unsigned int _nmulti; + + const bool _trA; + const bool _trB; + + const Activation _act; + + const int _maxthreads; + int _nthreads; + + /* Blocking info */ + unsigned int _k_block=0; + unsigned int _x_block=0; + + unsigned int _Mround_div=0; + unsigned int _Mround=0; + unsigned int _Nround_div=0; + unsigned int _Nround=0; + + /* Working space, pretransposed buffer */ + void *_working_space=nullptr; + + /* We will need to walk through the blocks of B in a few contexts, so + * factor that out. */ + class blockwalker { + private: + /* Size loops, etc. based on our parent's configuration */ + const GemmInterleaved2d<strategy, To, Tr> &_parent; + + /* K, X and multi parameters for current iteration. */ + unsigned int _k0=0, _x0=0, _xmin=0, _xmax=0, _multi=0; + + unsigned int _index=0; + bool _done=false; + bool _newkblock=true; + bool _newmulti=true; + + public: + blockwalker(const GemmInterleaved2d<strategy, To, Tr> &parent) + : _parent(parent) + , _xmax { parent._Nsize } + { } + + blockwalker(const GemmInterleaved2d<strategy, To, Tr> &parent, unsigned int x0, unsigned int xmax) + : _parent(parent) + , _x0 { x0 } + , _xmin { x0 } + , _xmax { xmax } + { + assert(_x0 <= _xmax); + } + + unsigned int xmax() { + return std::min(_x0 + _parent._x_block, _xmax); + } + + unsigned int kmax() { + return std::min(_k0 + _parent._k_block, _parent._Ksize); + } + + /* Advance to the next block, return false at the end. */ + bool advance(void) { + if (_done) { + return false; + } + + _newkblock=false; + _x0 += _parent._x_block; + if (_x0 >= _xmax) { + _x0=_xmin; + _k0 += _parent._k_block; + if (_k0 >= _parent._Ksize) { + _k0=0; + _multi++; + if (_multi >= _parent._nmulti) { + _done=true; + return false; + } + _newmulti=true; + } + _newkblock=true; + } + _index++; + + return true; + } + + unsigned int k0(void) { return _k0; } + unsigned int x0(void) { return _x0; } + unsigned int multi(void) { return _multi; } + unsigned int index(void) { return _index; } + bool done(void) { return _done; } + bool newkblock(void) { return _newkblock; } + }; + + // A working size: One of these needed, regardless of thread count. Divided according to window. + size_t get_a_working_size() const { + return ROUND_UP(sizeof(Toi) * _k_block * _Mround * _nbatches) * 2; + } + + // B working size: 0, 1 or 3 of these needed depending on pretransposed and threading settings. + size_t get_b_working_size() const { + return ROUND_UP(sizeof(Toi) * _x_block * _k_block); + } + + // C working size: One needed per thread. + size_t get_c_working_size() const { + return ROUND_UP(sizeof(Tri) * _x_block * strategy::out_height()); + } + + void execute_transpose(unsigned int m_start, unsigned int m_end, unsigned int n_start, unsigned int n_end, int threadid, int mthreadid, int nthreadid) { + UNUSED(mthreadid); + + strategy strat(_ci); + + /* Translate 'start' and 'end' into a position within the batches and rows. 
*/ + const unsigned int window_per_batch = _Mround / strategy::out_height(); + unsigned int batch_0 = m_start / window_per_batch; + unsigned int batch_end = m_end / window_per_batch; + + /* Compute the M values to operate on */ + unsigned int m_0 = (m_start - (batch_0 * window_per_batch)) * strategy::out_height(); + unsigned int m_max = (m_end - (batch_end * window_per_batch)) * strategy::out_height(); + + unsigned int n_0 = std::min(this->_Nsize, strategy::out_width() * n_start); + unsigned int n_max = std::min(this->_Nsize, strategy::out_width() * n_end); + + blockwalker current(*this, n_0, n_max); + + /* get workspace as int8_t */ + assert(_working_space); + int8_t *working_space_bytes = reinterpret_cast<int8_t *>(_working_space); + + auto c_panel_start = working_space_bytes; + auto a_panel_start = c_panel_start + get_c_working_size() * _maxthreads; + auto b_panel_start = a_panel_start + get_a_working_size() * _maxthreads; + + auto c_panel = reinterpret_cast<Tri *>(c_panel_start + get_c_working_size() * threadid); + auto a_panel = reinterpret_cast<Toi *>(a_panel_start + get_a_working_size() * nthreadid); + auto b_panel = reinterpret_cast<Toi *>(b_panel_start + get_b_working_size() * threadid); + + + // newkblock() is always true on the first iteration, so this will be set properly on the first loop. + + int kern_k = 0; + for (;!current.done();current.advance()) { + const int bblocks = iceildiv(current.xmax() - current.x0(), strategy::out_width()); + /* + * The entirity of A^kblock is transpose upfront and computed against individual + * blocks of B (xblock) + * + * Therefore, we only need to retranspose when k_block progresses + */ + if (current.newkblock()) { + for (unsigned int batch = batch_0; batch <= batch_end; batch++) { + unsigned int first_m = (batch == batch_0) ? m_0 : 0; + unsigned int last_m = (batch == batch_end) ? m_max : _Msize; + + if (first_m >= last_m) + continue; + + auto a_thread_panel_in = this->_Aptr + + (batch * this->_A_batch_stride) + + (current.multi() * this->_A_multi_stride); + + auto a_thread_panel_out = a_panel + ((batch * _Mround + first_m) * _k_block); + + strat.transforms.PrepareA( + a_thread_panel_out, + a_thread_panel_in, + this->_lda, + first_m, + last_m, + current.k0(), + current.kmax(), + _trA); + } + + kern_k = iceildiv(current.kmax() - current.k0(), strategy::k_unroll()); + kern_k *= strat.k_unroll(); + } + + auto *b_panel_in = this->_Bptr + (current.multi() * this->_B_multi_stride); + + strat.transforms.PrepareB( + b_panel, //dst + b_panel_in, //src + this->_ldb, + current.x0(), //idx from + current.xmax(), //idx to + current.k0(), + current.kmax(), + _trB); + + //Iterate over the batches + for (unsigned int batch = batch_0; batch <= batch_end; batch++) { + unsigned int first_m = (batch == batch_0) ? m_0 : 0; + unsigned int last_m = (batch == batch_end) ? m_max : _Msize; + + if (first_m >= last_m) + continue; + + const Toi *a_ptr = a_panel + (batch * _Mround + first_m) * _k_block; + + + //Iterate over the inerleaved rows of the packed A matrix + for (unsigned int y=first_m; y<last_m; y+=strategy::out_height()) { + unsigned int ymax = std::min(_Msize, y + strategy::out_height()); + + strat.kernel(a_ptr, b_panel, c_panel, 1, bblocks, kern_k); + a_ptr += (strategy::out_height() * kern_k); + + const bool first_pass = current.k0()==0; + const bool last_pass = current.kmax()==_Ksize; + + auto c_panel_out = this->_Cptr + + this->_C_batch_stride * batch + + this->_C_multi_stride * current.multi(); + + auto bias = (first_pass && this->_bias) + ? 
this->_bias + (current.multi() * this->_bias_multi_stride) + : nullptr; + + auto act = last_pass ? _act : Activation(); + + strat.transforms.Merge( + c_panel_out, + c_panel, + this->_ldc, + y, + ymax, + current.x0(), + current.xmax(), + bias, + act, + !first_pass); //Append + } + } + } + } +public: + GemmInterleaved2d(GemmInterleaved2d &) = delete; + GemmInterleaved2d & operator= (GemmInterleaved2d &) = delete; + + /* Constructor */ + /* Constructor */ + GemmInterleaved2d(const GemmArgs &args) + : _ci(args._ci) + , _Msize(args._Msize) + , _Nsize(args._Nsize) + , _Ksize(args._Ksize) + , _nbatches(args._nbatches) + , _nmulti(args._nmulti) + , _trA(args._trA) + , _trB(args._trB) + , _act(args._act) + , _maxthreads(args._maxthreads) + , _nthreads(args._maxthreads) + + // Work out the rounded size of M - needed for some buffers. + , _Mround_div ( iceildiv(_Msize, strategy::out_height()) ) + , _Mround ( _Mround_div * strategy::out_height() ) + + , _Nround_div ( iceildiv(_Nsize, strategy::out_width()) ) + , _Nround ( _Nround_div * strategy::out_width() ) + { + const unsigned int L1_size = _ci->get_L1_cache_size(); + const unsigned int L2_size = _ci->get_L2_cache_size(); + + assert(_maxthreads > 0); + + // Work out blocking parameters, or override from provided GemmConfig + if (args._cfg && args._cfg->inner_block_size) { + _k_block = args._cfg->inner_block_size; + } else { + // k_block: Find out how much of the larger array can be loaded into half the cache. + // This should account for associative caches. + _k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height()))); + + // Needs to be (at least a single) multiple of the K unroll level. + _k_block /= strategy::k_unroll(); + _k_block = std::max(_k_block, 1U) * strategy::k_unroll(); + + // Now tune to presented problem size; this is how many blocks we need. + unsigned int num_k_blocks = iceildiv(_Ksize, _k_block); + + // So divide the space equally into that many blocks. + _k_block = iceildiv(_Ksize, num_k_blocks); + + // And round UP to the K unroll level required. + _k_block = iceildiv(_k_block, strategy::k_unroll()); + _k_block *= strategy::k_unroll(); + } + + if (args._cfg && args._cfg->outer_block_size) { + _x_block = args._cfg->outer_block_size; + } else { + // x_block: Work out how many rows (of length k_block) will fit in the L2 + // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents. + _x_block = (((L2_size * 9) / 10) - (_k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) / + (sizeof(Toi) * _k_block); + + // Needs to be (at least a single) multiple of the kernel output width. + _x_block /= strategy::out_width(); + _x_block = std::max(_x_block, 1U) * strategy::out_width(); + + // And tune to the presented problem size. + unsigned int num_x_blocks = iceildiv(_Nsize, _x_block); + _x_block = iceildiv(_Nsize, num_x_blocks); + + _x_block = iceildiv(_x_block, strategy::out_width()); + _x_block *= strategy::out_width(); + } + + // Work out the rounded size of M - needed for some buffers. + } + + // Interface implementation - Compulsory functions + ndrange_t get_window_size() const override { + unsigned m = (_Mround / strategy::out_height()) * _nbatches; + unsigned n = _Nround_div; + + return { m, n, 1u, 1u, 1u, 1u }; + } + + // set_nthreads: pass on to buffer manager to avoid it waiting for non-existant threads. 
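The GemmInterleaved2d constructor above sizes _k_block so a K-slice of the larger operand fits in roughly half the L1 cache, re-balances it across the number of slices that implies, then sizes _x_block against 90% of the L2 in the same way. The sketch below works that arithmetic through once; the cache sizes, problem size and sgemm_12x8 tile shape (12 wide, 8 high, k_unroll = 1, 4-byte operands) are assumptions chosen only to make the numbers concrete.

#include <algorithm>
#include <cstdio>

static unsigned iceildiv(unsigned a, unsigned b) { return (a + b - 1) / b; }

int main() {
    // Assumed figures - not taken from any particular CPU.
    const unsigned L1 = 32 * 1024, L2 = 256 * 1024;
    const unsigned out_width = 12, out_height = 8, k_unroll = 1;
    const unsigned sizeof_Toi = 4;                  // float operands
    const unsigned Ksize = 1024, Nsize = 1000;

    // k_block: half the L1 divided by the larger tile dimension...
    unsigned k_block = (L1 / 2) / (sizeof_Toi * std::max(out_width, out_height)); // 341
    k_block = std::max(k_block / k_unroll, 1u) * k_unroll;
    // ...then spread K evenly over the number of blocks that implies.
    unsigned num_k_blocks = iceildiv(Ksize, k_block);                             // 4
    k_block = iceildiv(iceildiv(Ksize, num_k_blocks), k_unroll) * k_unroll;       // 256

    // x_block: rows of length k_block that fit in 90% of L2, minus the L1-resident tiles.
    unsigned x_block = (((L2 * 9) / 10) - (k_block * sizeof_Toi * (out_width + out_height)))
                       / (sizeof_Toi * k_block);                                  // 210
    x_block = std::max(x_block / out_width, 1u) * out_width;                      // 204
    unsigned num_x_blocks = iceildiv(Nsize, x_block);                             // 5
    x_block = iceildiv(iceildiv(Nsize, num_x_blocks), out_width) * out_width;     // 204

    std::printf("k_block=%u x_block=%u\n", k_block, x_block);                     // 256, 204
}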
+ void set_nthreads(int nthreads) override { + _nthreads = std::min(nthreads, _maxthreads); + } + + void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override { + /* + * This particular GEMM implementation can only be broken up over the M & N + * dimensions, we inform the frame work of this limitation via the get_window_size function + */ + assert(ndrange_popcount(work_range) <= 2); + + const auto m_start = work_range.get_position(0); + const auto n_start = work_range.get_position(1); + const auto m_size = work_range.get_size(0); + const auto n_size = work_range.get_size(1); + const auto m_end = m_start + m_size; + const auto n_end = n_start + n_size; + + const auto m_threadid = thread_locator.get_position(0); + const auto n_threadid = thread_locator.get_position(1); + + execute_transpose(m_start, m_end, n_start, n_end, threadid, m_threadid, n_threadid); + } + + std::size_t get_working_size()const override { + /* + * Because we do not know how schedular will break up + * the task, we need to ensure that alloc enough + * space to be able to handle the case where every thread + * is parallelised across B AND also every thrread is parallelised across A + * + * If we parallelise across A, then we only need one buffer of A and 64 buffers of B + * If we parallelise across B, then we only need 64 buffer of B and + */ + return get_c_working_size() * _maxthreads + + get_a_working_size() * _maxthreads + + get_b_working_size() * _maxthreads + + 64; //to account for cacheline alignment + } + + + void set_working_space(void *working_space) override { + // Make sure everything ends up cache line aligned + int8_t *working_space_bytes = reinterpret_cast<int8_t *>(working_space); + intptr_t working_space_int = reinterpret_cast<intptr_t>(working_space); + + size_t diff=0; + + if (working_space_int & 0x3F) { + diff = 0x40 - (working_space_int & 0x3F); + } + + working_space_bytes += diff; + + _working_space = reinterpret_cast<void *>(working_space_bytes); + } + + ~GemmInterleaved2d() override { } +}; + +} // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp new file mode 100644 index 0000000000..eff4877198 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp @@ -0,0 +1,514 @@ +/* + * Copyright (c) 2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#include "arm_gemm.hpp" +#include "utils.hpp" + +#include "mergeresults.hpp" +#include "transform.hpp" + +#ifdef CYCLE_PROFILING +#include "profiler.hpp" +#endif + +#include <algorithm> +#include <cassert> + +// Some macros used to decide how much working space to allocate. +// Round allocations up to the next cache line. +#define ALLOC_ROUND 64 +#define ROUND_UP(x) ((((x) + ALLOC_ROUND-1) / ALLOC_ROUND) * ALLOC_ROUND) + +// Implementation of the GemmCommon abstract class. +// +// This implementation interleaves the source matrices in blocks - good for +// larger matrices. +namespace arm_gemm { + +template<typename strategy, typename To, typename Tr> +class GemmInterleavedPretransposed2d : public GemmCommon<To, Tr> { + typedef typename strategy::operand_type Toi; + typedef typename strategy::result_type Tri; + + /* const properties set by constructor */ + const CPUInfo * const _ci; + + const unsigned int _Msize; + const unsigned int _Nsize; + const unsigned int _Ksize; + + const unsigned int _nbatches; + const unsigned int _nmulti; + + const bool _trA; + const bool _trB; + + const Activation _act; + + const int _maxthreads; + int _nthreads; + + /* Blocking info */ + unsigned int _k_block=0; + unsigned int _x_block=0; + + unsigned int _Mround_div=0; + unsigned int _Mround=0; + unsigned int _Nround_div=0; + unsigned int _Nround=0; + + /* Working space, pretransposed buffer */ + const Toi *_B_transposed=nullptr; + void *_working_space=nullptr; + + /* We will need to walk through the blocks of B in a few contexts, so + * factor that out. */ + class blockwalker { + private: + /* Size loops, etc. based on our parent's configuration */ + const GemmInterleavedPretransposed2d<strategy, To, Tr> &_parent; + + /* K, X and multi parameters for current iteration. */ + unsigned int _k0=0, _x0=0, _xmin=0, _xmax=0, _multi=0; + + unsigned int _index=0; + bool _done=false; + bool _newkblock=true; + bool _newmulti=true; + + public: + blockwalker(const GemmInterleavedPretransposed2d<strategy, To, Tr> &parent) + : _parent(parent) + , _xmax { parent._Nsize } + { } + + blockwalker(const GemmInterleavedPretransposed2d<strategy, To, Tr> &parent, unsigned int x0, unsigned int xmax) + : _parent(parent) + , _x0 { x0 } + , _xmin { x0 } + , _xmax { xmax } + { + assert(_x0 <= _xmax); + } + + unsigned int xmax() { + return std::min(_x0 + _parent._x_block, _xmax); + } + + unsigned int kmax() { + return std::min(_k0 + _parent._k_block, _parent._Ksize); + } + + /* Advance to the next block, return false at the end. */ + bool advance(void) { + if (_done) { + return false; + } + + _newkblock=false; + _x0 += _parent._x_block; + if (_x0 >= _xmax) { + _x0=_xmin; + _k0 += _parent._k_block; + if (_k0 >= _parent._Ksize) { + _k0=0; + _multi++; + if (_multi >= _parent._nmulti) { + _done=true; + return false; + } + _newmulti=true; + } + _newkblock=true; + } + _index++; + + return true; + } + + unsigned int k0(void) { return _k0; } + unsigned int x0(void) { return _x0; } + unsigned int multi(void) { return _multi; } + unsigned int index(void) { return _index; } + bool done(void) { return _done; } + bool newkblock(void) { return _newkblock; } + }; + + // A working size: One of these needed, regardless of thread count. Divided according to window. 
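The blockwalker above visits B in x-blocks innermost, K-blocks next and multis outermost, raising newkblock() on the first x-block of each K-slice so A is only re-packed at that point. The stripped-down loop below (with small assumed sizes and no class wrapper) just prints the visit order to make that nesting visible; it is an illustration, not the class as shipped.

#include <cstdio>
#include <algorithm>

int main() {
    const unsigned Ksize = 10, Nsize = 6, nmulti = 1;
    const unsigned k_block = 4, x_block = 4;

    // Same traversal as blockwalker::advance(), flattened into nested loops.
    for (unsigned multi = 0; multi < nmulti; multi++) {
        for (unsigned k0 = 0; k0 < Ksize; k0 += k_block) {
            const unsigned kmax = std::min(k0 + k_block, Ksize);
            bool newkblock = true;  // true for the first x-block of each K-slice
            for (unsigned x0 = 0; x0 < Nsize; x0 += x_block) {
                const unsigned xmax = std::min(x0 + x_block, Nsize);
                std::printf("multi=%u k=[%u,%u) x=[%u,%u)%s\n",
                            multi, k0, kmax, x0, xmax,
                            newkblock ? "  <- repack A here" : "");
                newkblock = false;
            }
        }
    }
}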
+ size_t get_a_working_size() const { + return ROUND_UP(sizeof(Toi) * _k_block * _Mround * _nbatches) * 2; + } + + // As B will be pretranspose we do not need to alloc any space for it + size_t get_b_working_size() const { + return 0; + } + + // C working size: One needed per thread. + size_t get_c_working_size() const { + return ROUND_UP(sizeof(Tri) * _x_block * strategy::out_height()); + } + + // Internal execute function. + // This supports both the "pretransposed" and "standard" interfaces via the template parameter. + void execute_pretranspose(unsigned int m_start, unsigned int m_end, unsigned int n_start, unsigned int n_end, int threadid, int mthreadid, int nthreadid) { + /* Make sure we've been set up correctly. */ + assert(_B_transposed); + assert(_working_space); + assert(this->_Aptr); + assert(this->_Cptr); + + UNUSED(mthreadid); + UNUSED(nthreadid); + +#ifdef CYCLE_PROFILING + profiler prof; +#endif + strategy strat(_ci); + + /* Translate 'start' and 'end' into a position within the batches and rows. */ + const unsigned int window_per_batch = _Mround / strategy::out_height(); + unsigned int batch_0 = m_start / window_per_batch; + unsigned int batch_end = m_end / window_per_batch; + + /* Compute the M values to operate on */ + unsigned int m_0 = (m_start - (batch_0 * window_per_batch)) * strategy::out_height(); + unsigned int m_max = (m_end - (batch_end * window_per_batch)) * strategy::out_height(); + + unsigned int n_0 = std::min(this->_Nsize, strategy::out_width() * n_start); + unsigned int n_max = std::min(this->_Nsize, strategy::out_width() * n_end); + + blockwalker current(*this, n_0, n_max); + + int8_t *working_space_bytes = reinterpret_cast<int8_t *>(_working_space); + + auto c_panel_start = working_space_bytes; + auto a_panel_start = c_panel_start + get_c_working_size() * _maxthreads; + + auto c_panel = reinterpret_cast<Tri *>(c_panel_start + get_c_working_size() * threadid); + auto a_panel = reinterpret_cast<Toi *>(a_panel_start + get_a_working_size() * threadid); + + /* B^t is stored in interleaved panels separated by their K-block component + * we want to store a pointer to the start of the current k-page + * then when we come to the next k-block we just add the size of the previous to + * this base pointer + */ + const Toi *b_panel_start = _B_transposed; + // b_panels stores a pointer to the start of our current block inside of the k-block + const Toi *b_panel = b_panel_start; + + // newkblock() is always true on the first iteration, so this will be set properly on the first loop. + unsigned b_page_size = 0; + int kern_k = 0; + for (;!current.done();current.advance()) { + int bblocks = iceildiv(current.xmax() - current.x0(), strategy::out_width()); + + if (current.newkblock()) { + kern_k = iceildiv(current.kmax() - current.k0(), strategy::k_unroll()); + kern_k *= strat.k_unroll(); + + unsigned b_thread_start_offset = iceildiv(current.x0(), strategy::out_width()); + + b_panel_start += b_page_size; + b_panel = b_panel_start + (b_thread_start_offset * strat.out_width() * kern_k); + b_page_size = _Nround * kern_k; + + for (unsigned int batch = batch_0; batch <= batch_end; batch++) { + unsigned int first_m = (batch == batch_0) ? m_0 : 0; + unsigned int last_m = (batch == batch_end) ? 
m_max : _Msize; + + if (first_m >= last_m) + continue; + + auto a_thread_panel_in = this->_Aptr + + (batch * this->_A_batch_stride) + + (current.multi() * this->_A_multi_stride); + + auto a_thread_panel_out = a_panel + ((batch * _Mround + first_m) * _k_block); + + strat.transforms.PrepareA( + a_thread_panel_out, + a_thread_panel_in, + this->_lda, + first_m, + last_m, + current.k0(), + current.kmax(), + _trA); + } + } + + /* Do the actual work. */ + for (unsigned int batch = batch_0; batch <= batch_end; batch++) { + unsigned int first_m = (batch == batch_0) ? m_0 : 0; + unsigned int last_m = (batch == batch_end) ? m_max : _Msize; + + const Toi *a_ptr = a_panel + (batch * _Mround + first_m) * _k_block; + + if (first_m >= last_m) + continue; + + for (unsigned int y=first_m; y<last_m; y+=strategy::out_height()) { + unsigned int ymax = std::min(_Msize, y + strategy::out_height()); + + strat.kernel(a_ptr, b_panel, c_panel, 1, bblocks, kern_k); + a_ptr += (strategy::out_height() * kern_k); + + /* Only activate on last pass, only add bias on first pass, ask for accumulation on any non-first pass */ + const bool first_pass = current.k0()==0; + const bool last_pass = current.kmax()==_Ksize; + + auto c_panel_out = this->_Cptr + + this->_C_batch_stride * batch + + this->_C_multi_stride * current.multi(); + + auto bias = (first_pass && this->_bias) + ? this->_bias + (current.multi() * this->_bias_multi_stride) + : nullptr; + + auto act = last_pass ? _act : Activation(); + + strat.transforms.Merge( + c_panel_out, + c_panel, + this->_ldc, + y, + ymax, + current.x0(), + current.xmax(), + bias, + act, + !first_pass); //Append + } + } + + b_panel += (bblocks * strat.out_width() * kern_k); + } + } + +public: + GemmInterleavedPretransposed2d(GemmInterleavedPretransposed2d &) = delete; + GemmInterleavedPretransposed2d & operator= (GemmInterleavedPretransposed2d &) = delete; + + /* Constructor */ + GemmInterleavedPretransposed2d(const GemmArgs &args) + : _ci(args._ci) + , _Msize(args._Msize) + , _Nsize(args._Nsize) + , _Ksize(args._Ksize) + , _nbatches(args._nbatches) + , _nmulti(args._nmulti) + , _trA(args._trA) + , _trB(args._trB) + , _act(args._act) + , _maxthreads(args._maxthreads) + , _nthreads(args._maxthreads) + + // Work out the rounded size of M - needed for some buffers. + , _Mround_div ( iceildiv(_Msize, strategy::out_height()) ) + , _Mround ( _Mround_div * strategy::out_height() ) + + , _Nround_div ( iceildiv(_Nsize, strategy::out_width()) ) + , _Nround ( _Nround_div * strategy::out_width() ) + { + + assert(args._pretransposed_hint); + assert(_maxthreads > 0); + + const unsigned int L1_size = _ci->get_L1_cache_size(); + const unsigned int L2_size = _ci->get_L2_cache_size(); + + // Work out blocking parameters, or override from provided GemmConfig + if (args._cfg && args._cfg->inner_block_size) { + _k_block = args._cfg->inner_block_size; + } else { + // k_block: Find out how much of the larger array can be loaded into half the cache. + // This should account for associative caches. + _k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height()))); + + // Needs to be (at least a single) multiple of the K unroll level. + _k_block /= strategy::k_unroll(); + _k_block = std::max(_k_block, 1U) * strategy::k_unroll(); + + // Now tune to presented problem size; this is how many blocks we need. + unsigned int num_k_blocks = iceildiv(_Ksize, _k_block); + + // So divide the space equally into that many blocks. 
+ _k_block = iceildiv(_Ksize, num_k_blocks); + + // And round UP to the K unroll level required. + _k_block = iceildiv(_k_block, strategy::k_unroll()); + _k_block *= strategy::k_unroll(); + } + + if (args._cfg && args._cfg->outer_block_size) { + _x_block = args._cfg->outer_block_size; + } else { + // x_block: Work out how many rows (of length k_block) will fit in the L2 + // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents. + _x_block = (((L2_size * 9) / 10) - (_k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) / + (sizeof(Toi) * _k_block); + + // Needs to be (at least a single) multiple of the kernel output width. + _x_block /= strategy::out_width(); + _x_block = std::max(_x_block, 1U) * strategy::out_width(); + + // And tune to the presented problem size. + unsigned int num_x_blocks = iceildiv(_Nsize, _x_block); + _x_block = iceildiv(_Nsize, num_x_blocks); + + _x_block = iceildiv(_x_block, strategy::out_width()); + _x_block *= strategy::out_width(); + } + } + + // Interface implementation - Compulsory functions + ndrange_t get_window_size() const override { + unsigned m = (_Mround / strategy::out_height()) * _nbatches; + unsigned n = _Nround_div; + + return { m, n, 1u, 1u, 1u, 1u }; + } + + // set_nthreads: pass on to buffer manager to avoid it waiting for non-existant threads. + void set_nthreads(int nthreads) override { + _nthreads = std::min(nthreads, _maxthreads); + } + + void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override { + /* This particular GEMM implementation can only be broken up over the M & N + * dimensions, we inform the frame work of this limitation via the get_window_size function + */ + assert(ndrange_popcount(work_range) <= 2); + + const auto m_start = work_range.get_position(0); + const auto n_start = work_range.get_position(1); + const auto m_size = work_range.get_size(0); + const auto n_size = work_range.get_size(1); + const auto m_end = m_start + m_size; + const auto n_end = n_start + n_size; + + const auto m_threadid = thread_locator.get_position(0); + const auto n_threadid = thread_locator.get_position(1); + + execute_pretranspose(m_start, m_end, n_start, n_end, threadid, m_threadid, n_threadid); + } + + std::size_t get_working_size()const override { + /* Because we do not know how schedular will break up + * the task, we need to ensure that alloc enough + * space to be able to handle the case where every thread + * is parallelised across B AND also every thrread is parallelised across A + * + * If we parallelise across A, then we only need one buffer of A and 64 buffers of B + * If we parallelise across B, then we only need 64 buffer of B and + */ + return get_c_working_size() * _maxthreads + + get_a_working_size() * _maxthreads + + 64; //to account for cacheline alignment + } + + + void set_working_space(void *working_space) override { + // Make sure everything ends up cache line aligned + int8_t *working_space_bytes = reinterpret_cast<int8_t *>(working_space); + intptr_t working_space_int = reinterpret_cast<intptr_t>(working_space); + + size_t diff=0; + + if (working_space_int & 0x3F) { + diff = 0x40 - (working_space_int & 0x3F); + } + + working_space_bytes += diff; + + _working_space = reinterpret_cast<void *>(working_space_bytes); + } + + // Interface implementation - pretransposed + bool B_is_pretransposed() const override { + return true; + } + + bool B_pretranspose_required() const override { + return _B_transposed==nullptr; + } + + // 
TODO: this could almost certainly be considerably simpler. + size_t get_B_pretransposed_array_size() const override { + size_t total=0; + blockwalker current(*this); + + do { + /* Figure out the size of each block. */ + unsigned int x_size = (current.xmax() - current.x0()); + unsigned int k_size = (current.kmax() - current.k0()); + + /* Round sizes up as needed. */ + x_size = iceildiv(x_size, strategy::out_width()); + x_size *= strategy::out_width(); + + k_size = iceildiv(k_size, strategy::k_unroll()); + k_size *= strategy::k_unroll(); + + total += x_size * k_size * sizeof(Toi); + } while (current.advance()); + + return total; + } + + void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override { + blockwalker current(*this); + Toi *buffer = reinterpret_cast<Toi *>(in_buffer); + _B_transposed = buffer; + strategy strat(_ci); + + do { + /* Figure out the size of each block. */ + unsigned int x_size = (current.xmax() - current.x0()); + unsigned int k_size = (current.kmax() - current.k0()); + + /* Round sizes up as needed. */ + x_size = iceildiv(x_size, strategy::out_width()); + x_size *= strategy::out_width(); + + k_size = iceildiv(k_size, strategy::k_unroll()); + k_size *= strategy::k_unroll(); + + strat.transforms.PrepareB(buffer, B + (current.multi() * B_multi_stride), ldb, + current.x0(), current.xmax(), current.k0(), current.kmax(), _trB); + + buffer += (x_size * k_size); + } while (current.advance()); + } + + void set_pretransposed_B_data(void *in_buffer) override { + _B_transposed = reinterpret_cast<Toi *>(in_buffer); + } + + ~GemmInterleavedPretransposed2d() override { } +}; + +} // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/gemm_native.hpp b/src/core/NEON/kernels/arm_gemm/gemm_native.hpp index fe6ebef045..c2f742b5cf 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_native.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_native.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -87,8 +87,8 @@ public: _window_range(iceildiv(_Msize, strategy::out_height()), _nbatches, iceildiv(_Nsize, _n_block), _nmultis) { } // Window is amount per multi multiplied by total number of multis. - unsigned int get_window_size() const override { - return _window_range.total_size(); + ndrange_t get_window_size() const override { + return { _window_range.total_size(), 1u, 1u, 1u, 1u, 1u }; } // Native GEMMs can always be dynamically scheduled (whether requested or not) @@ -97,7 +97,7 @@ public: } // Actually execute the GEMM. - void execute(unsigned int start, unsigned int end, int) override { + void execute_1d(unsigned int start, unsigned int end, int) { #ifdef CYCLE_PROFILING profiler prof; #endif @@ -139,6 +139,16 @@ public: } } while (p.next_dim1()); } + + //Execute + void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override { + UNUSED(thread_locator); + + const auto start = work_range.get_position(0); + const auto stop = work_range.get_position_end(0); + + execute_1d(start, stop, threadid); + } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp b/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp index be2f5614be..939788ed8d 100644 --- a/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -58,7 +58,7 @@ public: UNUSED(ldc); } - unsigned int get_window_size() const override { + ndrange_t get_window_size() const override { return _subgemm->get_window_size(); } @@ -66,8 +66,8 @@ public: _subgemm->set_nthreads(nthreads); } - void execute(unsigned int start, unsigned int end, int threadid) override { - _subgemm->execute(start, end, threadid); + void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override { + _subgemm->execute(work_range, thread_locator, threadid); } size_t get_working_size() const override { diff --git a/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp index 49681ec404..190f4aa643 100644 --- a/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -72,12 +72,12 @@ public: } // Window is number of out_width blocks times number of multis. - unsigned int get_window_size() const override { - return iceildiv(_Nsize, strategy::out_width()) * _nmultis; + ndrange_t get_window_size() const override { + return { iceildiv(_Nsize, strategy::out_width()) * _nmultis, 1u, 1u, 1u, 1u, 1u }; } // Actually execute the GEMV. - void execute(unsigned int start, unsigned int end, int) override { + void execute_1d(unsigned int start, unsigned int end, int) { #ifdef CYCLE_PROFILING profiler prof; #endif @@ -127,6 +127,17 @@ public: } } } + + // Execute + void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override { + UNUSED(thread_locator); + + const auto start = work_range.get_position(0); + const auto size = work_range.get_size(0); + const auto stop = start + size; + + execute_1d(start, stop, threadid); + } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp index 26fdfba8ff..7f52ac5a14 100644 --- a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -86,12 +86,12 @@ public: } // Window is number of out_width blocks, times number of multis. - unsigned int get_window_size() const override { - return iceildiv(_Nsize, strategy::out_width()) * _nmultis; + ndrange_t get_window_size() const override { + return { iceildiv(_Nsize, strategy::out_width()) * _nmultis, 1u, 1u, 1u, 1u, 1u }; } // Actually execute the GEMV. 
- void execute(unsigned int start, unsigned int end, int) override { + void execute_1d(unsigned int start, unsigned int end, int) { #ifdef CYCLE_PROFILING profiler prof; #endif @@ -145,6 +145,17 @@ public: } } + // Execute + void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override { + UNUSED(thread_locator); + + const auto start = work_range.get_position(0); + const auto size = work_range.get_size(0); + const auto stop = start + size; + + execute_1d(start, stop, threadid); + } + /* Pretransposed interface implementation */ bool B_is_pretransposed() const override { return true; diff --git a/src/core/NEON/kernels/arm_gemm/ndrange.hpp b/src/core/NEON/kernels/arm_gemm/ndrange.hpp index 20824dfc8b..0c068db011 100644 --- a/src/core/NEON/kernels/arm_gemm/ndrange.hpp +++ b/src/core/NEON/kernels/arm_gemm/ndrange.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,16 +23,19 @@ */ #pragma once +#include <array> #include <algorithm> #include <initializer_list> +#include <cassert> + namespace arm_gemm { template<unsigned int D> class NDRange { private: - unsigned int m_sizes[D]; - unsigned int m_totalsizes[D]; + std::array<unsigned int, D> m_sizes {}; + std::array<unsigned int, D> m_totalsizes {}; class NDRangeIterator { private: @@ -81,8 +84,25 @@ private: }; public: + NDRange& operator=(const NDRange& rhs)=default; + NDRange(const NDRange& rhs) =default; + template <typename... T> - NDRange(T... ts) : m_sizes{ts...} { + NDRange(T... ts) + : m_sizes{ts...} + { + unsigned int t=1; + + for (unsigned int i=0; i<D; i++) { + t *= m_sizes[i]; + + m_totalsizes[i] = t; + } + } + + NDRange(const std::array<unsigned int, D>& n) + : m_sizes{n} + { unsigned int t=1; for (unsigned int i=0; i<D; i++) { @@ -105,4 +125,61 @@ public: } }; +/** NDCoordinate builds upon a range, but specifies a starting position + * in addition to a size which it inherits from NDRange + */ +template<unsigned int N> +class NDCoordinate : public NDRange<N> { + using int_t =unsigned int; + using ndrange_t = NDRange<N>; + + std::array<int_t, N> m_positions {}; +public: + NDCoordinate& operator=(const NDCoordinate& rhs)=default; + NDCoordinate(const NDCoordinate& rhs) =default; + NDCoordinate(const std::initializer_list<std::pair<int_t, int_t>>& list) + { + std::array<int_t, N> sizes; + + std::size_t i = 0; + for(auto& p : list) { + m_positions[i]= p.first; + sizes[i++] = p.second; + } + + //update the parents sizes + static_cast<ndrange_t&>(*this) = ndrange_t(sizes); + } + + int_t get_position(int_t d) const { + assert(d < m_positions.size()); + return m_positions[d]; + } + + void set_position(int_t d, int_t v) { + assert(d < size(m_positions)); + assert(v < ndrange_t::get_size(d)); + + m_positions[d] = v; + } + + int_t get_position_end(int_t d) const { + return get_position(d) + NDRange<N>::get_size(d); + } +}; //class NDCoordinate + +/** @returns the number of dimensions in the NDRange which have none-1 values + * IE there is actual work in these dimensions that can be broken up + */ +template<unsigned int N> +std::size_t ndrange_popcount(const NDRange<N>& ndr) { + std::size_t count = 0; + + for(unsigned int d = 0; d != N; ++d) { + if(ndr.get_size(d) != 1) + ++count; + } + return count; +} + } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp b/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp index 345060f206..18f030fec0 100644 --- 
a/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp +++ b/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -148,7 +148,7 @@ public: set_child_arrays(); } - unsigned int get_window_size() const override { + ndrange_t get_window_size() const override { return _subgemm->get_window_size(); } @@ -158,8 +158,9 @@ public: _args._maxthreads = nthreads; } - void execute(unsigned int start, unsigned int end, int threadid) override { - _subgemm->execute(start, end, threadid); + // Execute + void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override { + _subgemm->execute(work_range, thread_locator, threadid); if (!_args._pretransposed_hint) { col_sums_runtime(threadid); } diff --git a/src/runtime/CPP/CPPScheduler.cpp b/src/runtime/CPP/CPPScheduler.cpp index e684eeee98..0a03497cb9 100644 --- a/src/runtime/CPP/CPPScheduler.cpp +++ b/src/runtime/CPP/CPPScheduler.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -71,6 +71,61 @@ private: const unsigned int _end; }; +/** Given two dimensions and a maxium number of threads to utilise, calcualte the best + * combination of threads that fit in (mutliplied together) max_threads. + * + * This algorithm assumes that work in either of the dimensions is equally difficult + * to compute + * + * @returns [m_nthreads, n_nthreads] A pair of the threads that should be used in each dimension + */ +std::pair<unsigned, unsigned> split_2d(unsigned max_threads, std::size_t m, std::size_t n) +{ + /* + * We want the same ratio of threads in M & N to the ratio of m and n problem size + * + * Therefore: mt/nt == m/n where mt*nt == max_threads + * + * max_threads/nt = mt & (max_threads/nt) * (m/n) = nt + * nt^2 = max_threads * (m/n) + * nt = sqrt( max_threads * (m/n) ) + */ + //ratio of m to n in problem dimensions + double ratio = m / static_cast<double>(n); + + // nt = sqrt(max_threads * (m / n) ) + const unsigned adjusted = std::round( + std::sqrt(max_threads * ratio)); + + //find the nearest factor of max_threads + for(unsigned i = 0; i!= adjusted; ++i) + { + //try down + const unsigned adj_down = adjusted - i; + if(max_threads % adj_down == 0) + { + return { adj_down, max_threads / adj_down }; + } + + //try up + const unsigned adj_up = adjusted + i; + if(max_threads % adj_up == 0) + { + return { adj_up, max_threads / adj_up }; + } + } + + //we didn't find anything so lets bail out with maxes biased to the largest dimension + if(m > n) + { + return{ std::min<unsigned>(m, max_threads), 1 }; + } + else + { + return{ 1, std::min<unsigned>(n, max_threads) }; + } +} + /** Execute workloads[info.thread_id] first, then call the feeder to get the index of the next workload to run. * * Will run workloads until the feeder reaches the end of its range. 
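split_2d() above picks a thread grid whose aspect ratio follows the problem's M:N ratio and then snaps to the nearest factor of max_threads, falling back to a one-dimensional split biased towards the larger dimension when no factor is found. The standalone copy below reproduces the function so its behaviour can be checked on a couple of inputs; the sample problem sizes are arbitrary.

#include <cmath>
#include <cstddef>
#include <cstdio>
#include <utility>
#include <algorithm>

std::pair<unsigned, unsigned> split_2d(unsigned max_threads, std::size_t m, std::size_t n)
{
    const double ratio      = m / static_cast<double>(n);
    const unsigned adjusted = static_cast<unsigned>(std::round(std::sqrt(max_threads * ratio)));

    for (unsigned i = 0; i != adjusted; ++i) {
        const unsigned adj_down = adjusted - i;   // try the nearest factor below first
        if (max_threads % adj_down == 0)
            return { adj_down, max_threads / adj_down };

        const unsigned adj_up = adjusted + i;     // then the nearest factor above
        if (max_threads % adj_up == 0)
            return { adj_up, max_threads / adj_up };
    }

    // No factor found: give everything to the larger dimension.
    if (m > n)
        return { std::min<unsigned>(m, max_threads), 1 };
    return { 1, std::min<unsigned>(n, max_threads) };
}

int main()
{
    auto a = split_2d(12, 4000, 1000);   // tall problem on 12 threads -> 6 x 2
    auto b = split_2d(8, 1024, 1024);    // square problem on 8 threads -> 2 x 4
    std::printf("%u x %u, %u x %u\n", a.first, a.second, b.first, b.second);
}

Note that the factor search tries the value just below the ideal before the value just above it, which is why the square 1024x1024 case on 8 threads lands on a 2x4 grid rather than 4x2.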
@@ -314,50 +369,95 @@ void CPPScheduler::schedule(ICPPKernel *kernel, const Hints &hints) ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel"); const Window &max_window = kernel->window(); - const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension()); - const unsigned int num_threads = std::min(num_iterations, _impl->_num_threads); - if(num_iterations == 0) + if(hints.split_dimension() == IScheduler::split_dimensions_all) { - return; - } + /* + * if the split dim is size_t max then this signals we should parallelise over + * all dimensions + */ + const std::size_t m = max_window.num_iterations(Window::DimX); + const std::size_t n = max_window.num_iterations(Window::DimY); + + //in c++17 this can be swapped for auto [ m_threads, n_threads ] = split_2d(... + unsigned m_threads, n_threads; + std::tie(m_threads, n_threads) = split_2d(_impl->_num_threads, m, n); + + std::vector<IScheduler::Workload> workloads; + for(unsigned int ni = 0; ni != n_threads; ++ni) + { + for(unsigned int mi = 0; mi != m_threads; ++mi) + { + workloads.push_back( + [ ni, mi, m_threads, n_threads, &max_window, &kernel ] + (const ThreadInfo & info) + { + //narrow the window to our mi-ni workload + Window win = max_window.split_window(Window::DimX, mi, m_threads) + .split_window(Window::DimY, ni, n_threads); - if(!kernel->is_parallelisable() || num_threads == 1) - { - ThreadInfo info; - info.cpu_info = &_cpu_info; - kernel->run(max_window, info); + win.validate(); + + Window thread_locator; + thread_locator.set(Window::DimX, Window::Dimension(mi, m_threads)); + thread_locator.set(Window::DimY, Window::Dimension(ni, n_threads)); + + thread_locator.validate(); + + kernel->run_nd(win, info, thread_locator); + } + ); + } + } + run_workloads(workloads); } else { - unsigned int num_windows = 0; - switch(hints.strategy()) + const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension()); + const unsigned int num_threads = std::min(num_iterations, _impl->_num_threads); + + if(num_iterations == 0) { - case StrategyHint::STATIC: - num_windows = num_threads; - break; - case StrategyHint::DYNAMIC: - { - const unsigned int granule_threshold = (hints.threshold() <= 0) ? num_threads : static_cast<unsigned int>(hints.threshold()); - // Make sure we don't use some windows which are too small as this might create some contention on the ThreadFeeder - num_windows = num_iterations > granule_threshold ? granule_threshold : num_iterations; - break; - } - default: - ARM_COMPUTE_ERROR("Unknown strategy"); + return; } - std::vector<IScheduler::Workload> workloads(num_windows); - for(unsigned int t = 0; t < num_windows; t++) + + if(!kernel->is_parallelisable() || num_threads == 1) { - //Capture 't' by copy, all the other variables by reference: - workloads[t] = [t, &hints, &max_window, &num_windows, &kernel](const ThreadInfo & info) + ThreadInfo info; + info.cpu_info = &_cpu_info; + kernel->run(max_window, info); + } + else + { + unsigned int num_windows = 0; + switch(hints.strategy()) + { + case StrategyHint::STATIC: + num_windows = num_threads; + break; + case StrategyHint::DYNAMIC: + { + const unsigned int granule_threshold = (hints.threshold() <= 0) ? num_threads : static_cast<unsigned int>(hints.threshold()); + // Make sure we don't use some windows which are too small as this might create some contention on the ThreadFeeder + num_windows = num_iterations > granule_threshold ? 
granule_threshold : num_iterations; + break; + } + default: + ARM_COMPUTE_ERROR("Unknown strategy"); + } + std::vector<IScheduler::Workload> workloads(num_windows); + for(unsigned int t = 0; t < num_windows; t++) { - Window win = max_window.split_window(hints.split_dimension(), t, num_windows); - win.validate(); - kernel->run(win, info); - }; + //Capture 't' by copy, all the other variables by reference: + workloads[t] = [t, &hints, &max_window, &num_windows, &kernel](const ThreadInfo & info) + { + Window win = max_window.split_window(hints.split_dimension(), t, num_windows); + win.validate(); + kernel->run(win, info); + }; + } + run_workloads(workloads); } - run_workloads(workloads); } } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp index a3080e7f29..24bd7d7a8c 100644 --- a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp +++ b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp @@ -280,8 +280,8 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensor *a, c //if we disable this code below in brackets then ConvLayer deadlocks when threads > 1 and //the shapes are In=1x1x1024 Weights=1x1x1024x1001 Biases=1001 Out=1x1x1001 { - const int window_size = _gemm_kernel_asm->get_window_size(); - if(window_size < args._maxthreads) + const unsigned int window_size = get_total_window_size(*_gemm_kernel_asm); + if(window_size < static_cast<unsigned int>(args._maxthreads)) { _gemm_kernel_asm->set_nthreads(window_size); } @@ -404,7 +404,7 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run() if(_workspace.buffer() != nullptr) { _gemm_kernel_asm->set_working_space(reinterpret_cast<void *>(_workspace.buffer())); - const unsigned int window_size = _gemm_kernel_asm->get_window_size(); + const unsigned int window_size = get_total_window_size(*_gemm_kernel_asm); unsigned int num_threads = NEScheduler::get().num_threads(); if(window_size < num_threads) { @@ -427,14 +427,21 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run() in1_ptr, ldb, multi_stride_b, out_ptr, ldd, batch_stride_d, multi_stride_d, bias, 0); - // Schedule assembly kernel IScheduler::Hints scheduling_hint = IScheduler::Hints(Window::DimX); if(_kernel_info.method == arm_gemm::GemmMethod::GEMM_INTERLEAVED && _d->info()->data_type() == DataType::F32) { const int granule_threshold = 200; scheduling_hint = IScheduler::Hints(Window::DimX, IScheduler::StrategyHint::DYNAMIC, granule_threshold); + + } + else if(_kernel_info.method == arm_gemm::GemmMethod::GEMM_INTERLEAVED_2D && _d->info()->data_type() == DataType::F32) + { + //GEMM_INTERLEAVED supports 2D parallelism, IScheduler::split_dimensions_all signals to parallelise over all window dimensions + const int granule_threshold = 200; + scheduling_hint = IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold); } + NEScheduler::get().schedule(_optimised_kernel.get(), scheduling_hint); } |
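Tying the pieces together: ndrange.hpp earlier in this patch gives every kernel a six-dimensional window and hands each thread an NDCoordinate describing its slice, the 2D GEMMs populate only the first two dimensions, and CPPScheduler builds one workload per (mi, ni) tile. The sketch below mimics that flow for a single tile; the Coord6 type and the even chunk() split are simplified stand-ins (the real scheduler splits via Window::split_window), so treat it as an illustration of the interface rather than the actual implementation.

#include <cstdio>
#include <array>
#include <utility>
#include <algorithm>

// Simplified stand-in for NDCoordinate<6>: a (start, size) pair per dimension.
struct Coord6 {
    std::array<std::pair<unsigned, unsigned>, 6> d {{ {0,1},{0,1},{0,1},{0,1},{0,1},{0,1} }};
    unsigned get_position(unsigned i) const { return d[i].first; }
    unsigned get_size(unsigned i)     const { return d[i].second; }
};

// Count the dimensions that actually carry work, like ndrange_popcount().
unsigned dims_with_work(const Coord6 &c) {
    unsigned n = 0;
    for (unsigned i = 0; i < 6; ++i) n += (c.get_size(i) != 1);
    return n;
}

int main() {
    // A GemmInterleaved2d-style window: 40 M-blocks x 9 N-blocks, other dims flat.
    const unsigned m_win = 40, n_win = 9;
    const unsigned m_threads = 6, n_threads = 2;   // e.g. what split_2d(12, ...) returned

    // Build the work range for thread tile (mi, ni) = (2, 1) by splitting each
    // dimension into even chunks (assumption - the scheduler uses split_window).
    const unsigned mi = 2, ni = 1;
    auto chunk = [](unsigned total, unsigned parts, unsigned idx) {
        const unsigned step  = (total + parts - 1) / parts;
        const unsigned start = std::min(idx * step, total);
        return std::make_pair(start, std::min(step, total - start));
    };

    Coord6 work;
    work.d[0] = chunk(m_win, m_threads, mi);   // M blocks for this thread
    work.d[1] = chunk(n_win, n_threads, ni);   // N blocks for this thread

    // Prints: dims with work: 2, M=[14,+7) N=[5,+4)
    std::printf("dims with work: %u, M=[%u,+%u) N=[%u,+%u)\n",
                dims_with_work(work),
                work.get_position(0), work.get_size(0),
                work.get_position(1), work.get_size(1));
}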