From 1d480652b820317fc97ccbc3cb517e3b9e8be197 Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Wed, 23 Jan 2019 11:24:50 +0000 Subject: COMPMID-1867: Add u8 and s8 hybrid assembly kernels. Change-Id: Ifeb005f9d18d19feff11949474cce84d9e03749c Reviewed-on: https://review.mlplatform.org/565 Reviewed-by: Michalis Spyrou Tested-by: Arm Jenkins --- .../core/NEON/kernels/assembly/gemm_common.hpp | 122 +- src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp | 260 +-- src/core/NEON/kernels/arm_gemm/gemm_int8.cpp | 11 +- .../NEON/kernels/arm_gemm/gemm_interleaved.hpp | 13 +- src/core/NEON/kernels/arm_gemm/gemm_native.hpp | 91 +- src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp | 11 +- src/core/NEON/kernels/arm_gemm/gemv_batched.hpp | 2 + .../kernels/arm_gemm/gemv_native_transposed.hpp | 8 +- .../NEON/kernels/arm_gemm/gemv_pretransposed.hpp | 21 +- .../kernels/arm_gemm/kernels/a32_sgemm_8x6.hpp | 8 +- .../kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp | 8 +- .../kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp | 8 +- .../kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp | 8 +- .../kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp | 8 +- .../kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp | 8 +- .../kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp | 8 +- .../kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp | 8 +- .../arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4.hpp | 77 + .../kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp | 2271 ++++++++++++++++++++ .../kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp | 1605 ++++++++++++++ .../arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4.hpp | 77 + .../kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp | 2271 ++++++++++++++++++++ .../kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp | 1605 ++++++++++++++ .../kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp | 8 +- .../a64_sgemm_nativeA_pretransposeB_16x4.hpp | 8 +- .../arm_gemm/kernels/a64_sgemm_native_16x4.hpp | 8 +- .../arm_gemm/kernels/a64_sgemv_pretransposed.hpp | 25 +- .../kernels/arm_gemm/kernels/a64_sgemv_trans.hpp | 11 +- .../arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp | 8 +- .../kernels/sve_interleaved_fp16_mla_3VLx8.hpp | 6 +- .../kernels/sve_interleaved_fp32_mla_3VLx8.hpp | 6 +- .../kernels/sve_interleaved_s8s32_dot_3VLx8.hpp | 6 +- .../kernels/sve_interleaved_u8u32_dot_3VLx8.hpp | 6 +- .../arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp | 8 +- .../kernels/sve_native_s8s32_dot_4VLx4.hpp | 8 +- .../kernels/sve_native_u8u32_dot_4VLx4.hpp | 8 +- .../arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4.hpp | 8 +- .../kernels/sve_smallK_hybrid_fp32_mla_1VLx4.hpp | 8 +- src/core/NEON/kernels/arm_gemm/ndrange.hpp | 108 + src/core/NEON/kernels/arm_gemm/utils.hpp | 5 +- 40 files changed, 8393 insertions(+), 361 deletions(-) create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/ndrange.hpp diff --git a/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp b/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp index 7b4f0149e3..c72f210e56 100644 --- a/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp +++ 
b/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2019 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -34,42 +34,19 @@ namespace arm_gemm { // working space (permute as they go along). This interface should support // all of them. -template -class GemmCommon { -protected: - const To *_Aptr=nullptr; - int _lda=0; - int _A_batch_stride=0; - int _A_multi_stride=0; - const To *_Bptr=nullptr; - int _ldb=0; - int _B_multi_stride=0; - Tr *_Cptr=nullptr; - int _ldc=0; - int _C_batch_stride=0; - int _C_multi_stride=0; - +// The real GemmCommon class is templated based on the operand and return +// type. This is an interface class which is independent of those types. +class IGemmCommon { public: /* Pass in the pointers to the arrays to be operated on and their - * strides. This has a default implementation that just captures them - * all in protected members. If B is pretransposed (see below) then the - * settings for B here are ignored. + * strides. In the interface class these are passed as void pointers - + * the templated version overloads this function with a version which + * takes appropriately typed pointers. If B is pretransposed (see + * below) then the settings for B here are ignored. */ - virtual void set_arrays(const To *A, const int lda, const int A_batch_stride, const int A_multi_stride, - const To *B, const int ldb, /* batches share B */ const int B_multi_stride, - Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride) { - _Aptr = A; - _lda = lda; - _A_batch_stride = A_batch_stride; - _A_multi_stride = A_multi_stride; - _Bptr = B; - _ldb = ldb; - _B_multi_stride = B_multi_stride; - _Cptr = C; - _ldc = ldc; - _C_batch_stride = C_batch_stride; - _C_multi_stride = C_multi_stride; - } + virtual void set_arrays(const void *A, const int lda, const int A_batch_stride, const int A_multi_stride, + const void *B, const int ldb, /* batches share B */ const int B_multi_stride, + void *C, const int ldc, const int C_batch_stride, const int C_multi_stride) = 0; /* For threading, we divide the work into some number of units and work * out internally what unit corresponds to what work. This returns the @@ -90,6 +67,9 @@ public: */ virtual void set_nthreads(int) { }; + /* Whether this GEMM can be dynamically scheduled or not. */ + virtual bool supports_dynamic_scheduling() const { return false; } + /* Actually do the work. Provide a threadid to index any per-thread * buffers, and a start/end range to indicate which work to do. */ virtual void execute(unsigned int, unsigned int, int) = 0; @@ -107,14 +87,78 @@ public: virtual bool B_pretranspose_required() const { return false; } /* Total number of bytes of space needed for pretransposed arrays. */ virtual size_t get_B_pretransposed_array_size() const { return 0; } - /* Perform pretranspose - the void * passed in must remain allocated for the duration of any execute calls. */ - /* Arguments are: output buffer pointer, source pointer, source row stride, source multi stride */ - virtual void pretranspose_B_array(void *, const To *, const int, const int) { }; + /* Perform pretranspose - arguments are output, input, input row stride and input multi stride. */ + /* The "real" version of this depends on the templated operand type (see below). 
*/ + virtual void pretranspose_B_array(void *, const void *, const int, const int) = 0; /* Set pretransposed data - the void * passed in must previously have been passed to pretranspose_B_array() for the same or a similar GEMM. */ virtual void set_pretransposed_B_data(void *) { } // Destructor - virtual ~GemmCommon() { } + virtual ~IGemmCommon() { } +}; + +/* + * "Real" GemmCommon class which is templated on the operand and return types. + * + * In addition to correctly typed versions of the functions that operate on + * operand and return data, this class provides a default implementation of + * 'set_arrays' to capture the provided arguments in protected class + * members, as essentially any implementation will need these. + */ +template +class GemmCommon : public IGemmCommon { +protected: + const To *_Aptr=nullptr; + int _lda=0; + int _A_batch_stride=0; + int _A_multi_stride=0; + const To *_Bptr=nullptr; + int _ldb=0; + int _B_multi_stride=0; + Tr *_Cptr=nullptr; + int _ldc=0; + int _C_batch_stride=0; + int _C_multi_stride=0; + +public: + /* Pass in the pointers to the arrays to be operated on and their + * strides (templated version with appropriate types). */ + virtual void set_arrays(const To *A, const int lda, const int A_batch_stride, const int A_multi_stride, + const To *B, const int ldb, /* batches share B */ const int B_multi_stride, + Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride) { + _Aptr = A; + _lda = lda; + _A_batch_stride = A_batch_stride; + _A_multi_stride = A_multi_stride; + _Bptr = B; + _ldb = ldb; + _B_multi_stride = B_multi_stride; + _Cptr = C; + _ldc = ldc; + _C_batch_stride = C_batch_stride; + _C_multi_stride = C_multi_stride; + } + + /* Implementation of the void * overload which casts its arguments to the appropriate type. */ + void set_arrays(const void *A, const int lda, const int A_batch_stride, const int A_multi_stride, + const void *B, const int ldb, /* batches share B */ const int B_multi_stride, + void *C, const int ldc, const int C_batch_stride, const int C_multi_stride) override { + set_arrays(static_cast(A), lda, A_batch_stride, A_multi_stride, + static_cast(B), ldb, B_multi_stride, + static_cast(C), ldc, C_batch_stride, C_multi_stride); + } + + /*** "Pretransposed" interface ***/ + + /* Perform pretranspose - the void * passed in must remain allocated for the duration of any execute calls. */ + /* Arguments are: output buffer pointer, source pointer, source row stride, source multi stride */ + virtual void pretranspose_B_array(void *, const To *, const int, const int) { }; + + /* Implementation of the void * overload which casts its arguments to the appropriate type. 
*/ + void pretranspose_B_array(void *out, const void *in, const int row_stride, const int multi_stride) override { + pretranspose_B_array(out, static_cast(in), row_stride, multi_stride); + } + }; -} // namespace arm_gemm +} // namespace arm_gemm \ No newline at end of file diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp index 09f03c6332..c2bd0bb882 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp @@ -28,6 +28,7 @@ #include #include "arm_gemm.hpp" +#include "ndrange.hpp" #include "utils.hpp" #include "mergeresults.hpp" @@ -60,69 +61,66 @@ class GemmHybrid : public GemmCommon { const Tr _beta; /* Blocking info */ - unsigned int _k_block=0; - unsigned int _x_block=0; - unsigned int _Mround=0; + const unsigned int _k_block; + const unsigned int _n_block; + const unsigned int _Mround; /* Pretransposed buffer. */ const Toi *_B_transposed=nullptr; - unsigned int _B_per_multi = 0; + const NDRange<4> _window_range; - /* We will need to walk through the blocks of B in a few contexts, so - * factor that out. */ - class blockwalker { - private: - /* Size loops, etc. based on our parent's configuration */ - const GemmHybrid &_parent; + static unsigned int compute_k_block(const GemmArgs &args) { + if (args._cfg && args._cfg->inner_block_size) { + return args._cfg->inner_block_size; + } - /* K, X and multi parameters for current iteration. */ - unsigned int _k0=0, _x0=0; + const unsigned int L1_size = args._ci->get_L1_cache_size(); - unsigned int _index=0; - bool _done=false; - bool _newkblock=true; + // k_block: Find out how much of the larger array can be loaded into half the cache. + // This should account for associative caches. + unsigned int k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height()))); - public: - blockwalker(const GemmHybrid &parent) : _parent(parent) { } + // Needs to be (at least a single) multiple of the K unroll level. + k_block /= strategy::k_unroll(); + k_block = std::max(k_block, 1U) * strategy::k_unroll(); - unsigned int xmax() { - return std::min(_x0 + _parent._x_block, _parent._Nsize); - } + // Now tune to presented problem size; this is how many blocks we need. + unsigned int numk_blocks = iceildiv(args._Ksize, k_block); - unsigned int kmax() { - return std::min(_k0 + _parent._k_block, _parent._Ksize); - } + // So divide the space equally into that many blocks. + k_block = iceildiv(args._Ksize, numk_blocks); - /* Advance to the next block, return false at the end. */ - bool advance(void) { - if (_done) { - return false; - } + // And round UP to the K unroll level required. 
+ k_block = roundup(k_block, strategy::k_unroll()); - _newkblock=false; - _x0 += _parent._x_block; - if (_x0 >= _parent._Nsize) { - _x0=0; - _k0 += _parent._k_block; - if (_k0 >= _parent._Ksize) { - _done=true; - return false; - } - _newkblock=true; - } - _index++; + return k_block; + } - return true; + static unsigned int compute_n_block(const GemmArgs &args) { + if (args._cfg && args._cfg->outer_block_size) { + return args._cfg->outer_block_size; } - unsigned int k0(void) { return _k0; } - unsigned int x0(void) { return _x0; } - unsigned int index(void) { return _index; } - bool done(void) { return _done; } - bool newkblock(void) { return _newkblock; } - }; + const unsigned int k_block = compute_k_block(args); + const unsigned int L2_size = args._ci->get_L2_cache_size(); + // n_block: Work out how many rows (of length k_block) will fit in the L2 + // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents. + unsigned int n_block = (((L2_size * 9) / 10) - (k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) / + (sizeof(Toi) * k_block); + + // Needs to be (at least a single) multiple of the kernel output width. + n_block /= strategy::out_width(); + n_block = std::max(n_block, 1U) * strategy::out_width(); + + // And tune to the presented problem size. + unsigned int numblocks = iceildiv(args._Nsize, n_block); + n_block = iceildiv(args._Nsize, numblocks); + n_block = roundup(n_block, strategy::out_width()); + + return n_block; + } public: GemmHybrid(GemmHybrid &) = delete; @@ -130,71 +128,20 @@ public: /* Constructor */ GemmHybrid(const GemmArgs &args) - : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize), _nbatches(args._nbatches), - _nmulti(args._nmulti), _trB(args._trB), _beta(args._beta) { - const unsigned int L1_size = _ci->get_L1_cache_size(); - const unsigned int L2_size = _ci->get_L2_cache_size(); - - _B_per_multi = (iceildiv(_Nsize, strategy::out_width()) * strategy::out_width()) * - (iceildiv(_Ksize, strategy::k_unroll()) * strategy::k_unroll()); - - // Work out blocking parameters, or override from config. - - if (args._cfg && args._cfg->inner_block_size) { - _k_block = args._cfg->inner_block_size; - } else { - // k_block: Find out how much of the larger array can be loaded into half the cache. - // This should account for associative caches. - _k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height()))); - - // Needs to be (at least a single) multiple of the K unroll level. - _k_block /= strategy::k_unroll(); - _k_block = std::max(_k_block, 1U) * strategy::k_unroll(); - - // Now tune to presented problem size; this is how many blocks we need. - int num_k_blocks = iceildiv(_Ksize, _k_block); - - // So divide the space equally into that many blocks. - _k_block = iceildiv(_Ksize, num_k_blocks); - - // And round UP to the K unroll level required. - _k_block = iceildiv(_k_block, strategy::k_unroll()); - _k_block *= strategy::k_unroll(); - } - - if (args._cfg && args._cfg->outer_block_size) { - _x_block = args._cfg->outer_block_size; - } else { - // x_block: Work out how many rows (of length k_block) will fit in the L2 - // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents. - _x_block = (((L2_size * 9) / 10) - (_k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) / - (sizeof(Toi) * _k_block); - - // Needs to be (at least a single) multiple of the kernel output width. 
- _x_block /= strategy::out_width(); - _x_block = std::max(_x_block, 1U) * strategy::out_width(); - - // And tune to the presented problem size. - int num_x_blocks = iceildiv(_Nsize, _x_block); - _x_block = iceildiv(_Nsize, num_x_blocks); - - _x_block = iceildiv(_x_block, strategy::out_width()); - _x_block *= strategy::out_width(); - } - - // Work out the rounded size of M - needed for some buffers. - _Mround = iceildiv(_Msize, strategy::out_height()); - _Mround *= strategy::out_height(); - } + : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize), + _nbatches(args._nbatches), _nmulti(args._nmulti), _trB(args._trB), _beta(args._beta), + _k_block(compute_k_block(args)), _n_block(compute_n_block(args)), + _Mround(roundup(args._Msize, strategy::out_height())), + _window_range(iceildiv(args._Msize, strategy::out_height()), _nbatches, iceildiv(_Nsize, _n_block), _nmulti) { } // Interface implementation - Compulsory functions - - // Window size: Only the last thread should do a ragged block, so dole - // out work in units of out_height. Factor batches and multi into the - // window too. unsigned int get_window_size() const override { - // _Mround is a multiple of out_height by definition. - return (_Mround / strategy::out_height()) * _nbatches * _nmulti; + return _window_range.total_size(); + } + + // This kernel can always be dynamically scheduled. + bool supports_dynamic_scheduling() const override { + return true; } // Execute @@ -206,50 +153,45 @@ public: /* Make sure we've been set up correctly. */ assert(_B_transposed); - - const unsigned int window_per_batch = iceildiv(_Msize, strategy::out_height()); - const unsigned int window_per_multi = window_per_batch * _nbatches; - - const unsigned int first_multi = start / window_per_multi; - const unsigned int last_multi = end / window_per_multi; - - const unsigned int first_batch = (start - (first_multi * window_per_multi)) / window_per_batch; - const unsigned int last_batch = (end - (last_multi * window_per_multi)) / window_per_batch; - - const unsigned int first_row = ((start - (first_multi * window_per_multi)) % window_per_batch) * strategy::out_height(); - const unsigned int last_row = ((end - (last_multi * window_per_multi)) % window_per_batch) * strategy::out_height(); - static_assert(std::is_same::value, "gemm_native: Operand types must be the same."); static_assert(std::is_same::value, "gemm_native: Result types must be the same."); - for (unsigned int multi = first_multi; multi <= last_multi; multi++) { - const unsigned int batch_0 = (multi == first_multi) ? first_batch : 0; - const unsigned int batch_max = (multi == last_multi) ? last_batch : (_nbatches - 1); + /* For now, each work item implies all the K for a given output + * pixel (so we don't need to synchronize access to the output + * array). So separate the loop over K blocks here. 
*/ + for (unsigned int k0=0; k0<_Ksize; k0+=_k_block) { + unsigned int kmax = std::min(k0 + _k_block, _Ksize); + unsigned int kern_k = roundup(kmax-k0, strategy::k_unroll()); - const Toi *b_panel = _B_transposed + (multi * _B_per_multi); + auto p = _window_range.iterator(start, end); - for (blockwalker current(*this); !current.done(); current.advance()) { - int kern_k = iceildiv(current.kmax() - current.k0(), strategy::k_unroll()); - kern_k *= strat.k_unroll(); + if (p.done()) { + return; + } - int bblocks = iceildiv(current.xmax() - current.x0(), strategy::out_width()); + do { + const unsigned int m_start = p.dim(0) * strategy::out_height(); + const unsigned int m_end = std::min(p.dim0_max() * strategy::out_height(), _Msize); + const unsigned int batch = p.dim(1); + const unsigned int n0 = p.dim(2) * _n_block; + const unsigned int nmax = std::min(n0 + _n_block, _Nsize); + const unsigned int multi = p.dim(3); + + const Toi *b_panel = _B_transposed + + (multi * roundup(_Nsize, strategy::out_width()) * roundup(_Ksize, strategy::k_unroll())) + + (k0 * roundup(_Nsize, strategy::out_width())) + + (n0 * kern_k); - for (unsigned int batch = batch_0; batch <= batch_max; batch++) { - const unsigned int m_start = ((multi == first_multi) && (batch == first_batch)) ? first_row : 0; - const unsigned int m_end = ((multi == last_multi) && (batch == last_batch) ) ? last_row : _Msize; #ifdef CYCLE_PROFILING - auto p = prof.ScopedProfiler(PROFILE_KERNEL, (m_end - m_start) * kern_k * bblocks * strategy::out_width()); + auto p = prof.ScopedProfiler(PROFILE_KERNEL, (m_end - m_start) * kern_k * roundup(nmax-n0, strategy::out_width())); #endif - strat.kernel(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + (m_start * this->_lda) + current.k0(), this->_lda, - b_panel, - this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + current.x0(), this->_ldc, - (current.k0() == 0) ? _beta : static_cast(1), - (m_end - m_start), (current.xmax() - current.x0()), kern_k); - } - - b_panel += (bblocks * strat.out_width() * kern_k); - } + strat.kernel(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + (m_start * this->_lda) + k0, this->_lda, + b_panel, + this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + n0, this->_ldc, + (k0 == 0) ? _beta : static_cast(1), + (m_end - m_start), (nmax - n0), kern_k); + } while (p.next_dim1()); } } @@ -263,35 +205,31 @@ public: } size_t get_B_pretransposed_array_size() const override { - return _B_per_multi * _nmulti * sizeof(Toi); + return roundup(_Nsize, strategy::out_width()) * roundup(_Ksize, strategy::k_unroll()) * _nmulti * sizeof(Toi); } + using GemmCommon::pretranspose_B_array; void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override { Toi *buffer = reinterpret_cast(in_buffer); _B_transposed = buffer; strategy strat(_ci); - for (unsigned int multi=0; multi < _nmulti; multi++) { - blockwalker current(*this); - - do { - /* Figure out the size of each block. */ - size_t x_size = (current.xmax() - current.x0()); - size_t k_size = (current.kmax() - current.k0()); + for (unsigned int multi=0; multi<_nmulti; multi++) { + for (unsigned int k0=0; k0<_Ksize; k0+=_k_block) { + const unsigned int kmax = std::min(k0 + _k_block, _Ksize); + const unsigned int k_size = roundup(kmax-k0, strategy::k_unroll()); - /* Round sizes up as needed. 
*/ - x_size = iceildiv(x_size, strategy::out_width()); - x_size *= strategy::out_width(); + for (unsigned int x0=0; x0<_Nsize; x0+=_n_block) { + const unsigned int xmax = std::min(x0+_n_block, _Nsize); - k_size = iceildiv(k_size, strategy::k_unroll()); - k_size *= strategy::k_unroll(); + const unsigned int size = roundup(xmax-x0, strategy::out_width()) * k_size; - strat.transforms.PrepareB( - buffer, B + (multi * B_multi_stride), ldb, - current.x0(), current.xmax(), current.k0(), current.kmax(), _trB); + strat.transforms.PrepareB( buffer, B + (multi * B_multi_stride), ldb, + x0, xmax, k0, kmax, _trB); - buffer += (x_size * k_size); - } while (current.advance()); + buffer += size; + } + } } } diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp index 34dc8bc341..5811c2a1ce 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2019 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,6 +25,7 @@ #include "arm_gemm.hpp" #include "gemm_common.hpp" +#include "gemm_hybrid.hpp" #include "gemm_implementation.hpp" #include "gemm_interleaved.hpp" #include "gemm_native.hpp" @@ -32,6 +33,7 @@ #include "kernels/a64_gemm_s16_12x8.hpp" #include "kernels/a64_gemm_s8_12x8.hpp" #include "kernels/a64_gemm_s8_4x4.hpp" +#include "kernels/a64_hybrid_s8s32_dot_16x4.hpp" #include "kernels/sve_interleaved_s8s32_dot_3VLx8.hpp" #include "kernels/sve_native_s8s32_dot_4VLx4.hpp" @@ -54,6 +56,13 @@ static const GemmImplementation gemm_s8_methods[] = { [](const GemmArgs &args) { return new GemmInterleaved(args); } }, #endif +{ + GemmMethod::GEMM_HYBRID, + "hybrid_s8s32_dot_16x4", + [](const GemmArgs &args) { return args._ci->has_dotprod() && args._Ksize>=16 && (args._Ksize % 16 == 0) && (args._Nsize % 16 == 0) && !args._trA && !args._trB && args._pretransposed_hint; }, + [](const GemmArgs &args) { return args._Nsize<=256 && args._Ksize>128; }, + [](const GemmArgs &args) { return new GemmHybrid(args); } +}, { GemmMethod::GEMM_INTERLEAVED, "gemm_s8_12x8", diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp index 436438f351..b83ccd3407 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp @@ -340,7 +340,7 @@ public: _k_block = std::max(_k_block, 1U) * strategy::k_unroll(); // Now tune to presented problem size; this is how many blocks we need. - int num_k_blocks = iceildiv(_Ksize, _k_block); + unsigned int num_k_blocks = iceildiv(_Ksize, _k_block); // So divide the space equally into that many blocks. _k_block = iceildiv(_Ksize, num_k_blocks); @@ -363,7 +363,7 @@ public: _x_block = std::max(_x_block, 1U) * strategy::out_width(); // And tune to the presented problem size. - int num_x_blocks = iceildiv(_Nsize, _x_block); + unsigned int num_x_blocks = iceildiv(_Nsize, _x_block); _x_block = iceildiv(_Nsize, num_x_blocks); _x_block = iceildiv(_x_block, strategy::out_width()); @@ -464,8 +464,8 @@ public: do { /* Figure out the size of each block. */ - size_t x_size = (current.xmax() - current.x0()); - size_t k_size = (current.kmax() - current.k0()); + unsigned int x_size = (current.xmax() - current.x0()); + unsigned int k_size = (current.kmax() - current.k0()); /* Round sizes up as needed. 
*/ x_size = iceildiv(x_size, strategy::out_width()); @@ -480,6 +480,7 @@ public: return total; } + using GemmCommon::pretranspose_B_array; void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override { blockwalker current(*this); Toi *buffer = reinterpret_cast(in_buffer); @@ -488,8 +489,8 @@ public: do { /* Figure out the size of each block. */ - size_t x_size = (current.xmax() - current.x0()); - size_t k_size = (current.kmax() - current.k0()); + unsigned int x_size = (current.xmax() - current.x0()); + unsigned int k_size = (current.kmax() - current.k0()); /* Round sizes up as needed. */ x_size = iceildiv(x_size, strategy::out_width()); diff --git a/src/core/NEON/kernels/arm_gemm/gemm_native.hpp b/src/core/NEON/kernels/arm_gemm/gemm_native.hpp index 579533418d..98516b1ca6 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_native.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_native.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2019 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -27,8 +27,7 @@ #include "arm_gemm.hpp" -#include "mergeresults.hpp" -#include "transform.hpp" +#include "ndrange.hpp" #ifdef CYCLE_PROFILING #include "profiler.hpp" @@ -55,19 +54,25 @@ class GemmNative : public GemmCommon { const unsigned int _nbatches; const unsigned int _nmultis; - Tr _beta; + const Tr _beta; const CPUInfo * const _ci; - unsigned int k_block=0; - unsigned int n_block=0; + const unsigned int _k_block; + const unsigned int _n_block; - unsigned int window_per_batch() const { - return iceildiv(_Msize, strategy::out_height()); + const NDRange<4> _window_range; + + static unsigned int compute_k_block(const GemmArgs &args) { + return args._Ksize; } - unsigned int window_per_multi() const { - return window_per_batch() * _nbatches; + static unsigned int compute_n_block(const GemmArgs &args) { + if ((args._cfg != nullptr) && args._cfg->outer_block_size > 0) { + return args._cfg->outer_block_size; + } else { + return args._Nsize; + } } public: @@ -75,15 +80,20 @@ public: GemmNative & operator= (GemmNative &) = delete; GemmNative(const GemmArgs &args) - : _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize), _nbatches(args._nbatches), _nmultis(args._nmulti), _beta(args._beta), _ci(args._ci) { - /* For now don't do any blocking. TODO: figure out if we should. */ - k_block = _Ksize; - n_block = _Nsize; - } + : _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize), + _nbatches(args._nbatches), _nmultis(args._nmulti), + _beta(args._beta), _ci(args._ci), + _k_block(compute_k_block(args)), _n_block(compute_n_block(args)), + _window_range(iceildiv(_Msize, strategy::out_height()), _nbatches, iceildiv(_Nsize, _n_block), _nmultis) { } // Window is amount per multi multiplied by total number of multis. unsigned int get_window_size() const override { - return window_per_multi() * _nmultis; + return _window_range.total_size(); + } + + // Native GEMMs can always be dynamically scheduled (whether requested or not) + bool supports_dynamic_scheduling() const override { + return true; } // Actually execute the GEMM. 
@@ -96,45 +106,30 @@ public: static_assert(std::is_same::value, "gemm_native: Operand types must be the same."); static_assert(std::is_same::value, "gemm_native: Result types must be the same."); - /* Compute starting point based on 'start' */ - unsigned int multi = start / window_per_multi(); - unsigned int multi_pos = start % window_per_multi(); + auto p = _window_range.iterator(start, end); - unsigned int batch = multi_pos / window_per_batch(); - unsigned int batch_pos = multi_pos % window_per_batch(); - - unsigned int y0 = batch_pos * strategy::out_height(); - - for (unsigned int l=end-start; l>0; ) { - // Do work from here to the end of the current batch/multi - const unsigned int ymax = std::min(y0 + (l * strategy::out_height()), _Msize); + if (p.done()) { + return; + } - // Work out how many units this is and subtract from loop counter. - l -= ((ymax - y0) + (strategy::out_height() - 1)) / strategy::out_height(); + do { + unsigned int y0 = p.dim(0) * strategy::out_height(); + unsigned int ymax = std::min(p.dim0_max() * strategy::out_height(), _Msize); + unsigned int batch = p.dim(1); + unsigned int n0 = p.dim(2) * _n_block; + unsigned int nmax = std::min(n0 + _n_block, _Nsize); + unsigned int multi = p.dim(3); #ifdef CYCLE_PROFILING - auto p = prof.ScopedProfiler(PROFILE_KERNEL, (ymax-y0) * _Nsize * _Ksize); + auto p = prof.ScopedProfiler(PROFILE_KERNEL, (ymax-y0) * (nmax - n0) * _Ksize); #endif strat.kernel(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + (y0 * this->_lda), this->_lda, - this->_Bptr + (multi * this->_B_multi_stride), this->_ldb, - this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (y0 * this->_ldc), this->_ldc, - _beta, (ymax-y0), _Nsize, _Ksize); - - /* Advance to next item */ - y0 = ymax; - - /* Check for batch/multi overflow */ - if (y0 >= _Msize) { - y0=0; - batch++; - if (batch == _nbatches) { - batch=0; - multi++; - } - } - } + this->_Bptr + (multi * this->_B_multi_stride) + n0, this->_ldb, + this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (y0 * this->_ldc) + n0, this->_ldc, + _beta, (ymax-y0), (nmax - n0), _Ksize); + } while (p.next_dim1()); } }; -} // namespace arm_gemm +} // namespace arm_gemm \ No newline at end of file diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp index 3c8df3f044..b95ca8016b 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2019 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -27,11 +27,13 @@ #include "gemm_common.hpp" #include "gemm_implementation.hpp" #include "gemm_interleaved.hpp" +#include "gemm_hybrid.hpp" #include "gemm_native.hpp" #include "kernels/a64_gemm_u16_12x8.hpp" #include "kernels/a64_gemm_u8_12x8.hpp" #include "kernels/a64_gemm_u8_4x4.hpp" +#include "kernels/a64_hybrid_u8u32_dot_16x4.hpp" #include "kernels/sve_interleaved_u8u32_dot_3VLx8.hpp" #include "kernels/sve_native_u8u32_dot_4VLx4.hpp" @@ -54,6 +56,13 @@ static const GemmImplementation gemm_u8_methods[] = { [](const GemmArgs &args) { return new GemmInterleaved(args); } }, #endif +{ + GemmMethod::GEMM_HYBRID, + "hybrid_u8u32_dot_16x4", + [](const GemmArgs &args) { return args._ci->has_dotprod() && args._Ksize>=16 && (args._Ksize % 16 == 0) && (args._Nsize % 16 == 0) && !args._trA && !args._trB && args._pretransposed_hint; }, + [](const GemmArgs &args) { return args._Nsize<=256 && args._Ksize>128; }, + [](const GemmArgs &args) { return new GemmHybrid(args); } +}, { GemmMethod::GEMM_INTERLEAVED, "gemm_u8_12x8", diff --git a/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp b/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp index 40f7f2b7cd..32d668f66d 100644 --- a/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp @@ -44,6 +44,7 @@ public: _subgemm = gemm(newargs); } + using GemmCommon::set_arrays; void set_arrays(const To *A, const int lda, const int A_batch_stride, const int A_multi_stride, const To *B, const int ldb, const int B_multi_stride, Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride) override { @@ -85,6 +86,7 @@ public: return _subgemm->get_B_pretransposed_array_size(); } + using GemmCommon::pretranspose_B_array; void pretranspose_B_array(void *buffer, const To *B, const int ldb, const int B_multi_stride) override { _subgemm->pretranspose_B_array(buffer, B, ldb, B_multi_stride); } diff --git a/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp index 5cf42761e6..5ebc6342d7 100644 --- a/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp @@ -73,7 +73,7 @@ public: // Window is number of out_width blocks times number of multis. unsigned int get_window_size() const override { - return iceildiv(_Nsize, strategy::out_width) * _nmultis; + return iceildiv(_Nsize, strategy::out_width()) * _nmultis; } // Actually execute the GEMV. 
@@ -83,12 +83,12 @@ public: #endif strategy strat(_ci); - const unsigned int window_per_multi = iceildiv(_Nsize, strategy::out_width); + const unsigned int window_per_multi = iceildiv(_Nsize, strategy::out_width()); const unsigned int multi_0 = start / window_per_multi; const unsigned int multi_end = end / window_per_multi; - const unsigned int n_0 = (start - (multi_0 * window_per_multi)) * strategy::out_width; - const unsigned int n_max = (end - (multi_end * window_per_multi)) * strategy::out_width; + const unsigned int n_0 = (start - (multi_0 * window_per_multi)) * strategy::out_width(); + const unsigned int n_max = (end - (multi_end * window_per_multi)) * strategy::out_width(); static_assert(std::is_same::value, "gemv_transposed: Operand types must be the same."); static_assert(std::is_same::value, "gemv_transposed: Result types must be the same."); diff --git a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp index 842339ef23..f7beb0a34c 100644 --- a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp @@ -70,7 +70,7 @@ public: GemvPretransposed(const GemmArgs &args) : _Nsize(args._Nsize), _Ksize(args._Ksize), _nmultis(args._nmulti), _trB(args._trB), _beta(args._beta), _ci(args._ci), - _buffer_per_multi(_Ksize * iceildiv(_Nsize, strategy::A_interleave) * strategy::A_interleave) { + _buffer_per_multi(_Ksize * iceildiv(_Nsize, strategy::A_interleave()) * strategy::A_interleave()) { /* For now don't do any blocking. TODO: figure out if we should. */ if (args._cfg && args._cfg->inner_block_size) { m_block = args._cfg->inner_block_size; @@ -87,7 +87,7 @@ public: // Window is number of out_width blocks, times number of multis. unsigned int get_window_size() const override { - return iceildiv(_Nsize, strategy::out_width) * _nmultis; + return iceildiv(_Nsize, strategy::out_width()) * _nmultis; } // Actually execute the GEMV. @@ -98,13 +98,13 @@ public: strategy strat(_ci); /* Break the window values down into multis of interest... */ - const unsigned int window_per_multi = iceildiv(_Nsize, strategy::out_width); + const unsigned int window_per_multi = iceildiv(_Nsize, strategy::out_width()); const unsigned int multi_0 = start / window_per_multi; const unsigned int multi_end = end / window_per_multi; /* ... and figure out where we start and end in the first and last multi. 
*/ - const unsigned int n_0 = (start - (multi_0 * window_per_multi)) * strategy::out_width; - const unsigned int n_max = (end - (multi_end * window_per_multi)) * strategy::out_width; + const unsigned int n_0 = (start - (multi_0 * window_per_multi)) * strategy::out_width(); + const unsigned int n_max = (end - (multi_end * window_per_multi)) * strategy::out_width(); static_assert(std::is_same::value, "GemvPretransposed: Result types must be the same."); @@ -124,8 +124,8 @@ public: auto p = prof.ScopedProfiler(PROFILE_KERNEL, (mmax-m0) * (nmax-n)); #endif /* This assumes that the underlying call was a GEMM with M=1; for the N=1 case we would have to pick up this->_Bptr below instead */ - strat.kernel(_A_pretransposed + (multi * _buffer_per_multi) + (n * _Ksize) + (m0 * strategy::A_interleave), - (_Ksize * strategy::A_interleave), + strat.kernel(_A_pretransposed + (multi * _buffer_per_multi) + (n * _Ksize) + (m0 * strategy::A_interleave()), + (_Ksize * strategy::A_interleave()), this->_Aptr + (multi * this->_A_multi_stride) + m0, this->_Cptr + (multi * this->_C_multi_stride) + n, _beta, (mmax-m0), (nmax-n)); @@ -148,6 +148,7 @@ public: return _buffer_per_multi * _nmultis * sizeof(To); } + using GemmCommon::pretranspose_B_array; void pretranspose_B_array(void *buffer, const To *B, const int ldb, const int B_multi_stride) override { Toi *A_buffer = reinterpret_cast(buffer); @@ -155,10 +156,10 @@ public: /* Reverse sense here as we are dealing with B rather than A. So if * strategy::A_transpose is false and _trB is false, we still * transpose. */ - if (_trB ^ strategy::A_transpose) { - Transform(A_buffer + (multi * _buffer_per_multi), B + (multi * B_multi_stride), ldb, 0, _Nsize, 0, _Ksize); + if (_trB ^ strategy::A_transpose()) { + Transform(A_buffer + (multi * _buffer_per_multi), B + (multi * B_multi_stride), ldb, 0, _Nsize, 0, _Ksize); } else { - Transform(A_buffer + (multi * _buffer_per_multi), B + (multi * B_multi_stride), ldb, 0, _Nsize, 0, _Ksize); + Transform(A_buffer + (multi * _buffer_per_multi), B + (multi * B_multi_stride), ldb, 0, _Nsize, 0, _Ksize); } } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6.hpp index 06e62456dc..234972270c 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -50,15 +50,15 @@ public: typedef void (*kern_type)(const float *, const float *, float *, int, int, int); /* Kernel blocking parameters */ - static int out_width() { + static unsigned int out_width() { return 8; } - static int out_height() { + static unsigned int out_height() { return 6; } - static int k_unroll() { + static unsigned int k_unroll() { return 1; } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp index 95a2bc2fbc..2fcb587df1 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -48,15 +48,15 @@ public: typedef void (*kern_type)(const int16_t *, const int16_t *, int32_t *, int, int, int); /* Kernel blocking parameters */ - static int out_width() { + static unsigned int out_width() { return 12; } - static int out_height() { + static unsigned int out_height() { return 8; } - static int k_unroll() { + static unsigned int k_unroll() { return 1; } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp index fdc0200435..cc205dc6e3 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -43,15 +43,15 @@ public: typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int); /* Kernel blocking parameters */ - static int out_width() { + static unsigned int out_width() { return 12; } - static int out_height() { + static unsigned int out_height() { return 8; } - static int k_unroll() { + static unsigned int k_unroll() { return 4; } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp index be7ead9f48..71c666ad00 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -42,15 +42,15 @@ public: typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int); /* Kernel blocking parameters */ - static int out_width() { + static unsigned int out_width() { return 4; } - static int out_height() { + static unsigned int out_height() { return 4; } - static int k_unroll() { + static unsigned int k_unroll() { return 16; } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp index d2692ba77f..3d5c92c622 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -48,15 +48,15 @@ public: typedef void (*kern_type)(const uint16_t *, const uint16_t *, uint32_t *, int, int, int); /* Kernel blocking parameters */ - static int out_width() { + static unsigned int out_width() { return 12; } - static int out_height() { + static unsigned int out_height() { return 8; } - static int k_unroll() { + static unsigned int k_unroll() { return 1; } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp index a252abfd3e..9032ba67b3 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -53,15 +53,15 @@ public: static const bool B_transpose = true; /* Kernel blocking parameters */ - static int out_width() { + static unsigned int out_width() { return 12; } - static int out_height() { + static unsigned int out_height() { return 8; } - static int k_unroll() { + static unsigned int k_unroll() { return 4; } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp index 2da3ecd4f8..fda7657b2b 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -50,15 +50,15 @@ public: static const bool B_transpose = true; /* Kernel blocking parameters */ - static int out_width() { + static unsigned int out_width() { return 4; } - static int out_height() { + static unsigned int out_height() { return 4; } - static int k_unroll() { + static unsigned int k_unroll() { return 16; } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp index 911a4ebb01..5b850b7a20 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -47,15 +47,15 @@ public: typedef void (*kern_type)(const __fp16 *, const __fp16 *, __fp16 *, int, int, int); /* Kernel blocking parameters */ - static int out_width() { + static unsigned int out_width() { return 24; } - static int out_height() { + static unsigned int out_height() { return 8; } - static int k_unroll() { + static unsigned int k_unroll() { return 1; } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4.hpp new file mode 100644 index 0000000000..c8934dff8a --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4.hpp @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2019 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#pragma once + +#ifdef __aarch64__ + +#include +#include "../std_transforms_fixed.hpp" + +namespace arm_gemm +{ + +// Actual kernel implementations +void a64_hybrid_s8s32_dot_16x4(const int8_t *, int, const int8_t *, int32_t *, int, int32_t, int, int, int); +void a64_hybrid_s8s32_dot_16x4_a55(const int8_t *, int, const int8_t *, int32_t *, int, int32_t, int, int, int); + +class hybrid_s8s32_dot_16x4 +{ +public: + typedef int8_t operand_type; + typedef int32_t result_type; + + typedef void (*kern_type)(const int8_t *, int, const int8_t *, int32_t *, int, int32_t, int, int, int); + + /* Kernel blocking parameters */ + static unsigned int out_height() + { + return 4; + } + + static unsigned int out_width() + { + return 16; + } + + static unsigned int k_unroll() + { + return 4; + } + + StdTransformsFixed transforms = {}; + + // Default to the generic kernel + kern_type kernel=a64_hybrid_s8s32_dot_16x4; + + hybrid_s8s32_dot_16x4(const CPUInfo *ci) + { + if (ci->get_cpu_model() == CPUModel::A55r1) { + kernel = a64_hybrid_s8s32_dot_16x4_a55; + } + } +}; + +} // namespace arm_gemm + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp new file mode 100644 index 0000000000..48bf842ca5 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp @@ -0,0 +1,2271 @@ +/* + * Copyright (c) 2019 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifdef __aarch64__ + +#include + +#include +#include "../../asmlib.hpp" +#include "../../utils.hpp" + +namespace arm_gemm { + +void a64_hybrid_s8s32_dot_16x4_a55(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int32_t beta, int M, int N, int K) { + const long beta0 = (beta == 0); + const int K_stride = ((K + 3) / 4) * 4; + const long loops_count = ((K + 16) / 32) - 1; + K -= loops_count * 32; + const long regs_count = (K / 16) - 1; + + for (int y=0; y + +#include +#include "../../asmlib.hpp" +#include "../../utils.hpp" + +namespace arm_gemm { + +void a64_hybrid_s8s32_dot_16x4(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int32_t beta, int M, int N, int K) { + const long beta0 = (beta == 0); + const int K_stride = ((K + 3) / 4) * 4; + const long loops_count = ((K + 16) / 32) - 1; + K -= loops_count * 32; + const long regs_count = (K / 16) - 1; + + for (int y=0; y +#include "../std_transforms_fixed.hpp" + +namespace arm_gemm +{ + +// Actual kernel implementations +void a64_hybrid_u8u32_dot_16x4(const uint8_t *, int, const uint8_t *, uint32_t *, int, uint32_t, int, int, int); +void a64_hybrid_u8u32_dot_16x4_a55(const uint8_t *, int, const uint8_t *, uint32_t *, int, uint32_t, int, int, int); + +class hybrid_u8u32_dot_16x4 +{ +public: + typedef uint8_t operand_type; + typedef uint32_t result_type; + + typedef void (*kern_type)(const uint8_t *, int, const uint8_t *, uint32_t *, int, uint32_t, int, int, int); + + /* Kernel blocking parameters */ + static unsigned int out_height() + { + return 4; + } + + static unsigned int out_width() + { + return 16; + } + + static unsigned int k_unroll() + { + return 4; + } + + StdTransformsFixed transforms = {}; + + // Default to the generic kernel + kern_type kernel=a64_hybrid_u8u32_dot_16x4; + + hybrid_u8u32_dot_16x4(const CPUInfo *ci) + { + if (ci->get_cpu_model() == CPUModel::A55r1) { + kernel = a64_hybrid_u8u32_dot_16x4_a55; + } + } +}; + +} // namespace arm_gemm + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp new file mode 100644 index 0000000000..230ecdce2d --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp @@ -0,0 +1,2271 @@ +/* + * Copyright (c) 2019 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifdef __aarch64__ + +#include + +#include +#include "../../asmlib.hpp" +#include "../../utils.hpp" + +namespace arm_gemm { + +void a64_hybrid_u8u32_dot_16x4_a55(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, uint32_t beta, int M, int N, int K) { + const long beta0 = (beta == 0u); + const int K_stride = ((K + 3) / 4) * 4; + const long loops_count = ((K + 16) / 32) - 1; + K -= loops_count * 32; + const long regs_count = (K / 16) - 1; + + for (int y=0; y + +#include +#include "../../asmlib.hpp" +#include "../../utils.hpp" + +namespace arm_gemm { + +void a64_hybrid_u8u32_dot_16x4(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, uint32_t beta, int M, int N, int K) { + const long beta0 = (beta == 0u); + const int K_stride = ((K + 3) / 4) * 4; + const long loops_count = ((K + 16) / 32) - 1; + K -= loops_count * 32; + const long regs_count = (K / 16) - 1; + + for (int y=0; y() * 4; } - static int k_unroll() + static unsigned int k_unroll() { return 1; } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp index 9d88b60cee..2ca4ce25e8 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp @@ -41,17 +41,17 @@ public: typedef void (*kern_type)(const __fp16 *, const __fp16 *, __fp16 *, int, int, int); /* Kernel blocking parameters */ - static int out_width() + static unsigned int out_width() { return get_vector_length<__fp16>() * 3; } - static int out_height() + static unsigned int out_height() { return 8; } - static int k_unroll() + static unsigned int k_unroll() { return 1; } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp index 2e8f261fe1..8c1fe6d0b6 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp @@ -41,17 +41,17 @@ public: typedef void (*kern_type)(const float *, const float *, float *, int, int, int); /* Kernel blocking parameters */ - static int out_width() + static unsigned int out_width() { return get_vector_length() * 3; } - static int out_height() + static unsigned int out_height() { return 8; } - static int k_unroll() + static unsigned int k_unroll() { return 1; } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp index 67154e6a3f..cbb21387b1 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp @@ -41,17 +41,17 @@ public: typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int); /* Kernel blocking parameters */ - static int out_width() + static unsigned int out_width() { return get_vector_length() * 3; } - static int out_height() + static unsigned int out_height() { return 8; } - static int k_unroll() + static unsigned int k_unroll() { return 4; } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp index 628c5a868e..99c039e121 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp +++ 
b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp @@ -41,17 +41,17 @@ public: typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); /* Kernel blocking parameters */ - static int out_width() + static unsigned int out_width() { return get_vector_length() * 3; } - static int out_height() + static unsigned int out_height() { return 8; } - static int k_unroll() + static unsigned int k_unroll() { return 4; } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp index fcc80d9fe5..d7f9f20074 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -42,17 +42,17 @@ public: typedef void (*kern_type)(const float *, int, const float *, int ldb, float *, int, float, int, int, int); /* Kernel blocking parameters */ - static int out_height() + static unsigned int out_height() { return 4; } - static int out_width() + static unsigned int out_width() { return get_vector_length() * 4; } - static int k_unroll() + static unsigned int k_unroll() { return 1; } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4.hpp index f5634e3618..8b98358cd4 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -42,17 +42,17 @@ public: typedef void (*kern_type)(const int8_t *, int, const int8_t *, int ldb, int32_t *, int, int32_t, int, int, int); /* Kernel blocking parameters */ - static int out_height() + static unsigned int out_height() { return 4; } - static int out_width() + static unsigned int out_width() { return get_vector_length() * 4; } - static int k_unroll() + static unsigned int k_unroll() { return 4; } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4.hpp index f5ebad8565..bcbd3d35f5 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -43,17 +43,17 @@ public: typedef void (*kern_type)(const uint8_t *, int, const uint8_t *, int ldb, uint32_t *, int, uint32_t, int, int, int); /* Kernel blocking parameters */ - static int out_height() + static unsigned int out_height() { return 4; } - static int out_width() + static unsigned int out_width() { return get_vector_length() * 4; } - static int k_unroll() + static unsigned int k_unroll() { return 4; } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4.hpp index 80b216ca14..06622d6f2e 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -42,17 +42,17 @@ public: typedef void (*kern_type)(const float *, int, const float *, int ldb, float *, int, float, int, int, int); /* Kernel blocking parameters */ - static int out_height() + static unsigned int out_height() { return 4; } - static int out_width() + static unsigned int out_width() { return get_vector_length() * 1; } - static int k_unroll() + static unsigned int k_unroll() { return 1; } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4.hpp index aa2c522382..022efdfc26 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -42,17 +42,17 @@ public: typedef void (*kern_type)(const float *, int, const float *, float *, int, float, int, int, int); /* Kernel blocking parameters */ - static int out_height() + static unsigned int out_height() { return 4; } - static int out_width() + static unsigned int out_width() { return get_vector_length() * 1; } - static int k_unroll() + static unsigned int k_unroll() { return 1; } diff --git a/src/core/NEON/kernels/arm_gemm/ndrange.hpp b/src/core/NEON/kernels/arm_gemm/ndrange.hpp new file mode 100644 index 0000000000..20824dfc8b --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/ndrange.hpp @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2019 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#pragma once + +#include +#include + +namespace arm_gemm { + +template +class NDRange { +private: + unsigned int m_sizes[D]; + unsigned int m_totalsizes[D]; + + class NDRangeIterator { + private: + const NDRange &m_parent; + unsigned int m_pos = 0; + unsigned int m_end = 0; + + public: + NDRangeIterator(const NDRange &p, unsigned int s, unsigned int e) : m_parent(p), m_pos(s), m_end(e) { } + + bool done() const { + return (m_pos >= m_end); + } + + unsigned int dim(unsigned int d) const { + unsigned int r = m_pos; + + if (d < (D - 1)) { + r %= m_parent.m_totalsizes[d]; + } + + if (d > 0) { + r /= m_parent.m_totalsizes[d-1]; + } + + return r; + } + + bool next_dim0() { + m_pos++; + + return !done(); + } + + bool next_dim1() { + m_pos += m_parent.m_sizes[0] - dim(0); + + return !done(); + } + + unsigned int dim0_max() const { + unsigned int offset = std::min(m_end - m_pos, m_parent.m_sizes[0] - dim(0)); + + return dim(0) + offset; + } + }; + +public: + template + NDRange(T... ts) : m_sizes{ts...} { + unsigned int t=1; + + for (unsigned int i=0; i +inline T iceildiv(const T a, const T b) { return (a + b - 1) / b; } -- cgit v1.2.1
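
[Editor's sketch, not part of the patch above] The interface split in gemm_common.hpp is what lets callers hold a GEMM through the untyped IGemmCommon base while concrete kernels keep typed signatures: the void* virtuals cast and forward to the typed overloads, and the `using GemmCommon<To,Tr>::pretranspose_B_array;` / `::set_arrays;` lines added in gemm_hybrid.hpp, gemm_interleaved.hpp, gemv_batched.hpp and gemv_pretransposed.hpp re-expose the void* names hidden by the derived classes' typed overrides. The minimal stand-alone sketch below illustrates that pattern only; MyGemm and its body are hypothetical stand-ins, not the library's classes.

#include <cstdint>
#include <cstdio>

class IGemmCommon {
public:
    // Untyped entry point, as in the patch's interface class.
    virtual void pretranspose_B_array(void *out, const void *in, const int row_stride, const int multi_stride) = 0;
    virtual ~IGemmCommon() { }
};

template <typename To, typename Tr>
class GemmCommon : public IGemmCommon {
public:
    // Typed version - concrete GEMMs override this one.
    virtual void pretranspose_B_array(void *, const To *, const int, const int) { }

    // Untyped version required by IGemmCommon: cast and forward to the typed overload.
    void pretranspose_B_array(void *out, const void *in, const int row_stride, const int multi_stride) override {
        pretranspose_B_array(out, static_cast<const To *>(in), row_stride, multi_stride);
    }
};

// Hypothetical concrete GEMM, standing in for GemmHybrid / GemmInterleaved.
class MyGemm : public GemmCommon<int8_t, int32_t> {
public:
    // Declaring the typed override hides the void* name in this class's scope; the patch
    // adds equivalent using-declarations in its concrete GEMM/GEMV classes so both
    // overloads stay callable on the concrete type.
    using GemmCommon<int8_t, int32_t>::pretranspose_B_array;

    void pretranspose_B_array(void *, const int8_t *, const int, const int) override {
        std::puts("typed pretranspose_B_array called");
    }
};

int main() {
    MyGemm gemm;
    IGemmCommon &igemm = gemm;

    int8_t B[16] = {};
    // Goes through the untyped entry point, which casts and lands in MyGemm's typed override.
    igemm.pretranspose_B_array(nullptr, B, 16, 0);
    return 0;
}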
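
[Editor's sketch, not part of the patch above] The new NDRange<4> in ndrange.hpp replaces the hand-rolled start/end decomposition previously done in GemmHybrid::execute and GemmNative::execute: the flat [start, end) window handed to execute() is decoded back into (M block, batch, N block, multi) coordinates, and dim0_max()/next_dim1() let one kernel call cover a run of consecutive row blocks, which is also what makes dynamic scheduling safe to report. The stand-alone usage sketch below assumes the NDRange member functions exactly as they are used in gemm_hybrid.hpp (iterator(), total_size(), dim(), dim0_max(), next_dim1(), done()) and that ndrange.hpp is on the include path; the block sizes and problem dimensions are made-up values.

#include <algorithm>
#include <cstdio>
#include "ndrange.hpp"   // header added by this patch

using namespace arm_gemm;

int main() {
    // Made-up problem size and blocking, standing in for strategy::out_height() / _n_block.
    const unsigned int Msize = 37, Nsize = 200, nbatches = 2, nmulti = 1;
    const unsigned int out_height = 4, n_block = 64;

    auto iceildiv = [](unsigned int a, unsigned int b) { return (a + b - 1) / b; };

    // Same 4-D space the patch builds: (M blocks, batches, N blocks, multis), dim 0 innermost.
    NDRange<4> window(iceildiv(Msize, out_height), nbatches, iceildiv(Nsize, n_block), nmulti);

    // A scheduler would split [0, total_size()) across threads; take the whole window here.
    auto p = window.iterator(0, window.total_size());

    if (!p.done()) {
        do {
            const unsigned int m_start = p.dim(0) * out_height;
            const unsigned int m_end   = std::min(p.dim0_max() * out_height, Msize); // run of row blocks
            const unsigned int batch   = p.dim(1);
            const unsigned int n0      = p.dim(2) * n_block;
            const unsigned int nmax    = std::min(n0 + n_block, Nsize);
            const unsigned int multi   = p.dim(3);

            // The real code calls strat.kernel(...) on this sub-problem here.
            std::printf("rows [%u,%u) batch %u cols [%u,%u) multi %u\n",
                        m_start, m_end, batch, n0, nmax, multi);
        } while (p.next_dim1());   // jump past the row blocks just consumed
    }
    return 0;
}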
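
[Editor's sketch, not part of the patch above] The blocking arithmetic moved into GemmHybrid::compute_k_block is easier to follow with concrete numbers: size the block from half the L1, snap it to the K unroll, then re-balance it across the number of blocks the problem actually needs. The sketch below reproduces that logic as a free function with the strategy constants passed in explicitly, using local helpers equivalent to the library's iceildiv/roundup; the cache size and kernel parameters (32 KB L1, 16x4 kernel, k_unroll 4, int8 operands) are illustrative assumptions, not library code.

#include <algorithm>
#include <cstdio>

// Local equivalents of the iceildiv/roundup helpers used by the patch.
static unsigned int iceildiv(unsigned int a, unsigned int b) { return (a + b - 1) / b; }
static unsigned int roundup(unsigned int a, unsigned int b) { return iceildiv(a, b) * b; }

// Mirror of GemmHybrid::compute_k_block with strategy constants as parameters.
static unsigned int compute_k_block(unsigned int Ksize, unsigned int L1_size,
                                    unsigned int out_width, unsigned int out_height,
                                    unsigned int k_unroll, unsigned int operand_size) {
    // How much of the larger operand fits in half the L1.
    unsigned int k_block = (L1_size / 2) / (operand_size * std::max(out_width, out_height));

    // At least one multiple of the K unroll.
    k_block /= k_unroll;
    k_block = std::max(k_block, 1U) * k_unroll;

    // Re-balance across the number of blocks the problem needs, then round up again.
    unsigned int num_k_blocks = iceildiv(Ksize, k_block);
    k_block = roundup(iceildiv(Ksize, num_k_blocks), k_unroll);

    return k_block;
}

int main() {
    // K=1024, 32 KB L1, int8 operands, 16x4 kernel, k_unroll 4:
    // 16384 / (1 * 16) = 1024 -> one block covers all of K, so k_block = 1024.
    std::printf("k_block = %u\n", compute_k_block(1024, 32768, 16, 4, 4, 1));

    // K=1200, same parameters: first guess 1024 -> 2 blocks -> 600 (already a multiple of 4).
    std::printf("k_block = %u\n", compute_k_block(1200, 32768, 16, 4, 4, 1));
    return 0;
}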