diff options
Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp')
-rw-r--r-- | src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp | 55 |
1 files changed, 24 insertions, 31 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp index e5cc79eaed..241c5fea27 100644 --- a/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp @@ -34,8 +34,8 @@ #include "profiler.hpp" #endif -namespace arm_gemm -{ +namespace arm_gemm { + // Implementation of the GemmCommon abstract class. // // This is implementation is for a "native" (no-transform) GEMV with a @@ -43,53 +43,48 @@ namespace arm_gemm // // As a native operation the source data is used in-place, so the internal // and external operand/result types must match. -template <typename strategy, typename To, typename Tr> -class GemvNativeTransposed : public GemmCommon<To, Tr> -{ +template<typename strategy, typename To, typename Tr> +class GemvNativeTransposed : public GemmCommon<To, Tr> { typedef typename strategy::operand_type Toi; - typedef typename strategy::result_type Tri; + typedef typename strategy::result_type Tri; const unsigned int _Nsize; const unsigned int _Ksize; + const unsigned int _nmultis; const Tr _beta; - const CPUInfo *const _ci; + const CPUInfo * const _ci; - unsigned int m_block = 0; - unsigned int n_block = 0; + unsigned int m_block=0; + unsigned int n_block=0; public: GemvNativeTransposed(GemvNativeTransposed &) = delete; - GemvNativeTransposed &operator=(GemvNativeTransposed &) = delete; + GemvNativeTransposed & operator= (GemvNativeTransposed &) = delete; - GemvNativeTransposed(const CPUInfo *ci, const unsigned int N, const unsigned int K, const unsigned int nmultis, const Tr beta) - : _Nsize(N), _Ksize(K), _nmultis(nmultis), _beta(beta), _ci(ci) - { + GemvNativeTransposed(const CPUInfo *ci, const unsigned int N, const unsigned int K, const unsigned int nmultis, const Tr beta) : _Nsize(N), _Ksize(K), _nmultis(nmultis), _beta(beta), _ci(ci) { /* For now don't do any blocking. TODO: figure out if we should. */ m_block = K; n_block = N; } // Window is number of out_width blocks times number of multis. - unsigned int get_window_size() const override - { + unsigned int get_window_size() const override { return iceildiv(_Nsize, strategy::out_width) * _nmultis; } // Actually execute the GEMV. - void execute(unsigned int start, unsigned int end, int) override - { + void execute(unsigned int start, unsigned int end, int) override { #ifdef CYCLE_PROFILING profiler prof; #endif - strategy strat(_ci); const unsigned int window_per_multi = iceildiv(_Nsize, strategy::out_width); - const unsigned int multi_0 = start / window_per_multi; - const unsigned int multi_end = end / window_per_multi; + const unsigned int multi_0 = start / window_per_multi; + const unsigned int multi_end = end / window_per_multi; const unsigned int n_0 = (start - (multi_0 * window_per_multi)) * strategy::out_width; const unsigned int n_max = (end - (multi_end * window_per_multi)) * strategy::out_width; @@ -97,27 +92,25 @@ public: static_assert(std::is_same<To, Toi>::value, "gemv_transposed: Operand types must be the same."); static_assert(std::is_same<Tr, Tri>::value, "gemv_transposed: Result types must be the same."); - for(unsigned int multi = multi_0; multi <= multi_end; multi++) - { - const unsigned int n_start = (multi == multi_0) ? n_0 : 0; - const unsigned int n_end = (multi == multi_end) ? n_max : _Nsize; + for (unsigned int multi=multi_0; multi<=multi_end; multi++) { + const unsigned int n_start = (multi==multi_0) ? n_0 : 0; + const unsigned int n_end = (multi==multi_end) ? n_max : _Nsize; - if(n_end <= n_start) + if (n_end <= n_start) continue; - for(unsigned int m0 = 0; m0 < _Ksize; m0 += m_block) - { + for (unsigned int m0=0; m0<_Ksize; m0+=m_block) { unsigned int mmax = std::min(m0 + m_block, _Ksize); - for(unsigned int n0 = n_start; n0 < n_end; n0 += n_block) - { + + for (unsigned int n0=n_start; n0<n_end; n0+=n_block) { unsigned int nmax = std::min(n0 + n_block, n_end); #ifdef CYCLE_PROFILING - auto p = prof.ScopedProfiler(PROFILE_KERNEL, (mmax - m0) * (nmax - n0)); + auto p = prof.ScopedProfiler(PROFILE_KERNEL, (mmax-m0) * (nmax-n0)); #endif strat.kernel(this->_Bptr + (multi * this->_B_multi_stride) + (m0 * this->_ldb) + n0, this->_Aptr + (multi * this->_A_multi_stride) + m0, this->_Cptr + (multi * this->_C_multi_stride) + n0, - _beta, this->_ldb, (mmax - m0), (nmax - n0)); + _beta, this->_ldb, (mmax-m0), (nmax-n0)); } } } |