From 5aa1a0b7ca5eed010e4b297a95b1c4851f741328 Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Thu, 2 Jul 2020 20:02:20 +0100
Subject: COMPID-3324: Clean GEMM kernels

Signed-off-by: Georgios Pinitas
Change-Id: I170de1671e061a78740caee31fb4a1b8642c1369
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3505
Tested-by: Arm Jenkins
Comments-Addressed: Arm Jenkins
Reviewed-by: Michele Di Giorgio
---
 src/core/NEON/kernels/assembly/arm_gemm.hpp    | 106 +++++++++--------
 src/core/NEON/kernels/assembly/gemm_common.hpp | 150 +++++++++++------------
 src/core/NEON/kernels/assembly/ndrange.hpp     | 158 ++++++++++++++-----------
 3 files changed, 220 insertions(+), 194 deletions(-)

(limited to 'src/core/NEON/kernels/assembly')

diff --git a/src/core/NEON/kernels/assembly/arm_gemm.hpp b/src/core/NEON/kernels/assembly/arm_gemm.hpp
index 7723224ec8..2df7132500 100644
--- a/src/core/NEON/kernels/assembly/arm_gemm.hpp
+++ b/src/core/NEON/kernels/assembly/arm_gemm.hpp
@@ -23,14 +23,14 @@
  */
 #pragma once

-#include <memory>
 #include <cstring>
+#include <memory>

 #include "arm_gemm_local.hpp"
 #include "gemm_common.hpp"

-namespace arm_gemm {
-
+namespace arm_gemm
+{
 enum class GemmMethod
 {
     DEFAULT,
@@ -47,12 +47,17 @@
 struct KernelDescription
 {
-    GemmMethod   method     = GemmMethod::DEFAULT;
-    std::string  name       = "";
-    bool         is_default = false;
+    GemmMethod  method     = GemmMethod::DEFAULT;
+    std::string name       = "";
+    bool        is_default = false;

-    KernelDescription(GemmMethod m, std::string n, bool d=false) : method(m), name(n), is_default(d) { }
-    KernelDescription() noexcept { }
+    KernelDescription(GemmMethod m, std::string n, bool d = false)
+        : method(m), name(n), is_default(d)
+    {
+    }
+    KernelDescription() noexcept
+    {
+    }
 };

 struct GemmConfig
@@ -62,23 +67,32 @@ struct GemmConfig
     unsigned int inner_block_size = 0;
     unsigned int outer_block_size = 0;

-    GemmConfig(GemmMethod method) : method(method) { }
-    GemmConfig() { }
+    GemmConfig(GemmMethod method)
+        : method(method)
+    {
+    }
+    GemmConfig()
+    {
+    }
 };

 struct Activation
 {
-    enum class Type {
+    enum class Type
+    {
         None,
         ReLU,
         BoundedReLU
     };

-    Type   type;
-    float  param1;
-    float  param2;
+    Type  type;
+    float param1;
+    float param2;

-    Activation(Type type=Type::None, float p1=0.0f, float p2=0.0f) : type(type), param1(p1), param2(p2) { }
+    Activation(Type type = Type::None, float p1 = 0.0f, float p2 = 0.0f)
+        : type(type), param1(p1), param2(p2)
+    {
+    }
 };

 struct GemmArgs
@@ -101,10 +115,8 @@ public:
              const unsigned int K, const unsigned int nbatches,
              const unsigned int nmulti, const bool trA, const bool trB,
              Activation act, const int maxthreads,
-             const bool pretransposed_hint, const GemmConfig *cfg=nullptr ) :
-             _ci(ci), _Msize(M), _Nsize(N), _Ksize(K), _nbatches(nbatches), _nmulti(nmulti),
-             _trA(trA), _trB(trB), _act(act), _maxthreads(maxthreads),
-             _pretransposed_hint(pretransposed_hint), _cfg(cfg)
+             const bool pretransposed_hint, const GemmConfig *cfg = nullptr)
+        : _ci(ci), _Msize(M), _Nsize(N), _Ksize(K), _nbatches(nbatches), _nmulti(nmulti), _trA(trA), _trB(trB), _act(act), _maxthreads(maxthreads), _pretransposed_hint(pretransposed_hint), _cfg(cfg)
     {
     }
 };
@@ -112,18 +124,18 @@ public:
 struct Requantize32
 {
 public:
-    const int32_t  *bias = nullptr;
-    size_t          bias_multi_stride = 0;
-    int32_t         a_offset = 0;
-    int32_t         b_offset = 0;
-    int32_t         c_offset = 0;
-    bool            per_channel_requant = false;
-    int32_t         per_layer_shift = 0;
-    int32_t         per_layer_mul = 0;
-    const int32_t  *per_channel_shifts = nullptr;
-    const int32_t  *per_channel_muls = nullptr;
-    int32_t         minval = 0;
-    int32_t         maxval = 0;
+    const int32_t *bias                = nullptr;
+    size_t         bias_multi_stride   = 0;
+    int32_t        a_offset            = 0;
+    int32_t        b_offset            = 0;
+    int32_t        c_offset            = 0;
+    bool           per_channel_requant = false;
+    int32_t        per_layer_shift     = 0;
+    int32_t        per_layer_mul       = 0;
+    const int32_t *per_channel_shifts  = nullptr;
+    const int32_t *per_channel_muls    = nullptr;
+    int32_t        minval              = 0;
+    int32_t        maxval              = 0;

     Requantize32() = default;

@@ -131,11 +143,9 @@ public:
     Requantize32(const int32_t *bias, size_t bias_multi_stride,
                  int32_t a_offset, int32_t b_offset, int32_t c_offset,
                  int32_t requant_shift, int32_t requant_mul,
-                 int32_t minv, int32_t maxv) :
-        bias(bias), bias_multi_stride(bias_multi_stride),
-        a_offset(a_offset), b_offset(b_offset), c_offset(c_offset),
-        per_channel_requant(false), per_layer_shift(requant_shift), per_layer_mul(requant_mul),
-        minval(minv), maxval(maxv)
+                 int32_t minv, int32_t maxv)
+        : bias(bias), bias_multi_stride(bias_multi_stride), a_offset(a_offset), b_offset(b_offset), c_offset(c_offset), per_channel_requant(false), per_layer_shift(requant_shift), per_layer_mul(requant_mul),
+          minval(minv), maxval(maxv)
     {
     }

@@ -143,11 +153,9 @@ public:
     Requantize32(const int32_t *bias, size_t bias_multi_stride,
                  int32_t a_offset, int32_t b_offset, int32_t c_offset,
                  const int32_t *requant_shifts, const int32_t *requant_muls,
-                 int32_t minv, int32_t maxv) :
-        bias(bias), bias_multi_stride(bias_multi_stride),
-        a_offset(a_offset), b_offset(b_offset), c_offset(c_offset),
-        per_channel_requant(true), per_channel_shifts(requant_shifts), per_channel_muls(requant_muls),
-        minval(minv), maxval(maxv)
+                 int32_t minv, int32_t maxv)
+        : bias(bias), bias_multi_stride(bias_multi_stride), a_offset(a_offset), b_offset(b_offset), c_offset(c_offset), per_channel_requant(true), per_channel_shifts(requant_shifts),
+          per_channel_muls(requant_muls), minval(minv), maxval(maxv)
     {
     }
 };
@@ -156,21 +164,21 @@ struct Nothing
 {
 };

-template<typename Top, typename Tret>
-using UniqueGemmCommon = std::unique_ptr<GemmCommon<Top, Tret> >;
+template <typename Top, typename Tret>
+using UniqueGemmCommon = std::unique_ptr<GemmCommon<Top, Tret>>;

 /* Low level API calls.
  * These are implemented as 'GemmArgs' versions, or with the arguments explicitly listed. */

 /* get_gemm_method(): Given the templated types and provided parameters,
  * which is the preferred method to implement this GEMM? */
-template<typename Top, typename Tret, class OutputStage = Nothing>
-KernelDescription get_gemm_method(const GemmArgs &args, const OutputStage & ={});
+template <typename Top, typename Tret, class OutputStage = Nothing>
+KernelDescription get_gemm_method(const GemmArgs &args, const OutputStage & = {});

-template<typename Top, typename Tret, class OutputStage = Nothing>
-UniqueGemmCommon<Top, Tret> gemm(const GemmArgs &args, const OutputStage & ={});
+template <typename Top, typename Tret, class OutputStage = Nothing>
+UniqueGemmCommon<Top, Tret> gemm(const GemmArgs &args, const OutputStage & = {});

-template<typename Top, typename Tret, class OutputStage = Nothing>
-std::vector<KernelDescription> get_compatible_kernels(const GemmArgs &args, const OutputStage & ={});
+template <typename Top, typename Tret, class OutputStage = Nothing>
+std::vector<KernelDescription> get_compatible_kernels(const GemmArgs &args, const OutputStage & = {});

 } // namespace arm_gemm

diff --git a/src/core/NEON/kernels/assembly/gemm_common.hpp b/src/core/NEON/kernels/assembly/gemm_common.hpp
index a44b774b9d..3b4c025371 100644
--- a/src/core/NEON/kernels/assembly/gemm_common.hpp
+++ b/src/core/NEON/kernels/assembly/gemm_common.hpp
@@ -23,15 +23,12 @@
  */
 #pragma once

-#include "arm_gemm_compute_iface.hpp"
+#include "ndrange.hpp"

 #include <cstddef>
-#include <cassert>
-
-#define UNUSED(x) (void)(x)

-namespace arm_gemm {
-
+namespace arm_gemm
+{
 // Abstract class for the GEMM/GEMV functions.
 //
 // GEMM implementations may be "native" (never require any input
@@ -41,7 +38,8 @@
 //
 // The real GemmCommon class is templated based on the operand and return
 // type.  This is an interface class which is independent of those types.
-class IGemmCommon {
+class IGemmCommon
+{
 public:
     /* Pass in the pointers to the arrays to be operated on and their
      * strides. This "generic" version uses void *s, the preferred version
@@ -50,9 +48,9 @@ public:
      * the settings for B here are ignored.
      */
     virtual void set_arrays_generic(const void *A, const int lda, const int A_batch_stride, const int A_multi_stride,
-                            const void *B, const int ldb, /* batches share B */ const int B_multi_stride,
-                            void *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
-                            const void *bias, /* no row or batch stride needed */ const int bias_multi_stride) = 0;
+                                    const void *B, const int ldb, /* batches share B */ const int B_multi_stride,
+                                    void *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
+                                    const void *bias, /* no row or batch stride needed */ const int bias_multi_stride) = 0;

     /** @returns an ndrange containing ranges of the compute space which can be
      * broken up and parallelised over
@@ -71,47 +69,64 @@ public:
      * This has an empty default implementation, as GEMMs which don't care
      * about thread count can safely ignore this.
      */
-    virtual void set_nthreads(int) { };
+    virtual void set_nthreads(int) {};

     /* Whether this GEMM can be dynamically scheduled or not. */
-    virtual bool supports_dynamic_scheduling() const { return false; }
+    virtual bool supports_dynamic_scheduling() const
+    {
+        return false;
+    }

     /** Main execute member fucntion
      * @param [in] work_range specifies the range of work we want to be computed, total range defined by get_window_size()
      * @param [in] thread_locator where are we inside of the thread space
      * @naram [in] threadid a unique threadid
      */
-    virtual void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) = 0;
+    virtual void execute(const ndcoord_t &work_range, const ndcoord_t &thread_locator, int threadid) = 0;

     /*** Working space interface (optional) ***/
     /* Total number of bytes of temporary working space needed.  If zero, it's not necessary to call set_working_space(). */
-    virtual size_t get_working_size() const { return 0; }
+    virtual size_t get_working_size() const
+    {
+        return 0;
+    }

     /* Provide working space buffer - the void * passed in must remain allocated for the duration of any execute calls. */
-    virtual void set_working_space(void *) { };
+    virtual void set_working_space(void *) {};

     /*** "Pretransposed" interface (optional) ***/
    /* Is this object set up for pretranspose?  If so, pretranspose_array() needs to be called before execute(); */
-    virtual bool B_is_pretransposed() const { return false; }
+    virtual bool B_is_pretransposed() const
+    {
+        return false;
+    }

     /* Does pretranspose still need to be done? */
-    virtual bool B_pretranspose_required() const { return false; }
+    virtual bool B_pretranspose_required() const
+    {
+        return false;
+    }

     /* Total number of bytes of space needed for pretransposed arrays. */
-    virtual size_t get_B_pretransposed_array_size() const { return 0; }
+    virtual size_t get_B_pretransposed_array_size() const
+    {
+        return 0;
+    }

     /* Perform pretranspose - arguments are output, input, input row stride and input multi stride. */
     /* The "real" version of this depends on the templated operand type (see below). */
     virtual void pretranspose_B_array_generic(void *, const void *, const int, const int) = 0;

     /* Set pretransposed data - the void * passed in must previously have been passed to pretranspose_B_array() for the same or a similar GEMM. */
-    virtual void set_pretransposed_B_data(void *) { }
+    virtual void set_pretransposed_B_data(void *)
+    {
+    }

     /*** "Quantized bias" interface (optional) ***/
     /* Set the bias vector for quantized GEMMs */
-    virtual void set_quantized_bias(const int32_t *bias, size_t bias_multi_stride)
+    virtual void set_quantized_bias(const int32_t *, size_t)
     {
-        UNUSED(bias);
-        UNUSED(bias_multi_stride);
     }

     // Destructor
-    virtual ~IGemmCommon() { }
+    virtual ~IGemmCommon()
+    {
+    }
 };

 /* "Real" GemmCommon class which is templated on the operand and return types.
@@ -121,50 +136,53 @@ public:
  * 'set_arrays' to capture the provided arguments in protected class
  * members, as essentially any implementation will need these.
  */
-template<typename To, typename Tr>
-class GemmCommon : public IGemmCommon {
+template <typename To, typename Tr>
+class GemmCommon : public IGemmCommon
+{
 protected:
-    const To *_Aptr=nullptr;
-    int _lda=0;
-    int _A_batch_stride=0;
-    int _A_multi_stride=0;
-    const To *_Bptr=nullptr;
-    int _ldb=0;
-    int _B_multi_stride=0;
-    Tr *_Cptr=nullptr;
-    int _ldc=0;
-    int _C_batch_stride=0;
-    int _C_multi_stride=0;
-    const Tr *_bias=nullptr;
-    int _bias_multi_stride=0;
+    const To *_Aptr              = nullptr;
+    int       _lda               = 0;
+    int       _A_batch_stride    = 0;
+    int       _A_multi_stride    = 0;
+    const To *_Bptr              = nullptr;
+    int       _ldb               = 0;
+    int       _B_multi_stride    = 0;
+    Tr       *_Cptr              = nullptr;
+    int       _ldc               = 0;
+    int       _C_batch_stride    = 0;
+    int       _C_multi_stride    = 0;
+    const Tr *_bias              = nullptr;
+    int       _bias_multi_stride = 0;

 public:
     /* Pass in the pointers to the arrays to be operated on and their
      * strides (templated version with appropriate types). */
     virtual void set_arrays(const To *A, const int lda, const int A_batch_stride, const int A_multi_stride,
-                            const To *B, const int ldb, /* batches share B */ const int B_multi_stride,
-                            Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
-                            const Tr *bias, /* no row or batch stride needed */ const int bias_multi_stride) {
-        _Aptr = A;
-        _lda = lda;
-        _A_batch_stride = A_batch_stride;
-        _A_multi_stride = A_multi_stride;
-        _Bptr = B;
-        _ldb = ldb;
-        _B_multi_stride = B_multi_stride;
-        _Cptr = C;
-        _ldc = ldc;
-        _C_batch_stride = C_batch_stride;
-        _C_multi_stride = C_multi_stride;
-        _bias = bias;
+                            const To *B, const int ldb, /* batches share B */ const int B_multi_stride,
+                            Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
+                            const Tr *bias, /* no row or batch stride needed */ const int bias_multi_stride)
+    {
+        _Aptr               = A;
+        _lda                = lda;
+        _A_batch_stride     = A_batch_stride;
+        _A_multi_stride     = A_multi_stride;
+        _Bptr               = B;
+        _ldb                = ldb;
+        _B_multi_stride     = B_multi_stride;
+        _Cptr               = C;
+        _ldc                = ldc;
+        _C_batch_stride     = C_batch_stride;
+        _C_multi_stride     = C_multi_stride;
+        _bias               = bias;
         _bias_multi_stride = bias_multi_stride;
     }
     /* Implementation of the void * overload which casts its arguments to the appropriate type. */
     void set_arrays_generic(const void *A, const int lda, const int A_batch_stride, const int A_multi_stride,
-                            const void *B, const int ldb, /* batches share B */ const int B_multi_stride,
-                            void *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
-                            const void *bias, /* no row or batch stride needed */ const int bias_multi_stride) override {
+                            const void *B, const int ldb, /* batches share B */ const int B_multi_stride,
+                            void *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
+                            const void *bias, /* no row or batch stride needed */ const int bias_multi_stride) override
+    {
         set_arrays(static_cast<const To *>(A), lda, A_batch_stride, A_multi_stride,
                    static_cast<const To *>(B), ldb, B_multi_stride,
                    static_cast<Tr *>(C), ldc, C_batch_stride, C_multi_stride,
                    static_cast<const Tr *>(bias), bias_multi_stride);
     }

@@ -175,27 +193,13 @@ public:
     /* Perform pretranspose - the void * passed in must remain allocated for the duration of any execute calls. */
     /* Arguments are: output buffer pointer, source pointer, source row stride, source multi stride */
-    virtual void pretranspose_B_array(void *, const To *, const int, const int) { };
+    virtual void pretranspose_B_array(void *, const To *, const int, const int) {};

     /* Implementation of the void * overload which casts its arguments to the appropriate type. */
-    void pretranspose_B_array_generic(void *out, const void *in, const int row_stride, const int multi_stride) override {
+    void pretranspose_B_array_generic(void *out, const void *in, const int row_stride, const int multi_stride) override
+    {
         pretranspose_B_array(out, static_cast<const To *>(in), row_stride, multi_stride);
     }
 };

-template <typename GemmKernel>
-inline
-int unsigned get_total_window_size(const GemmKernel& kernel)
-{
-    auto window=kernel.get_window_size();
-
-    unsigned int total = 1;
-    for(unsigned i = 0; i != arm_gemm::ndrange_max; ++i)
-    {
-        total *= window.get_size(i);
-    }
-
-    return total;
-}
-
 } // namespace arm_gemm

diff --git a/src/core/NEON/kernels/assembly/ndrange.hpp b/src/core/NEON/kernels/assembly/ndrange.hpp
index d082a3e9b8..86638298ab 100644
--- a/src/core/NEON/kernels/assembly/ndrange.hpp
+++ b/src/core/NEON/kernels/assembly/ndrange.hpp
@@ -23,104 +23,123 @@
  */
 #pragma once

-#include <array>
 #include <algorithm>
-#include <initializer_list>
-
+#include <array>
 #include <cassert>
+#include <initializer_list>

-namespace arm_gemm {
-
-template<unsigned int D>
-class NDRange {
+namespace arm_gemm
+{
+template <unsigned int D>
+class NDRange
+{
 private:
-    std::array<unsigned int, D> m_sizes {};
-    std::array<unsigned int, D> m_totalsizes {};
+    std::array<unsigned int, D> m_sizes{};
+    std::array<unsigned int, D> m_totalsizes{};

-    class NDRangeIterator {
+    class NDRangeIterator
+    {
     private:
         const NDRange &m_parent;
-        unsigned int m_pos = 0;
-        unsigned int m_end = 0;
+        unsigned int   m_pos = 0;
+        unsigned int   m_end = 0;

     public:
-        NDRangeIterator(const NDRange &p, unsigned int s, unsigned int e) : m_parent(p), m_pos(s), m_end(e) { }
+        NDRangeIterator(const NDRange &p, unsigned int s, unsigned int e)
+            : m_parent(p), m_pos(s), m_end(e)
+        {
+        }

-        bool done() const {
+        bool done() const
+        {
             return (m_pos >= m_end);
         }

-        unsigned int dim(unsigned int d) const {
+        unsigned int dim(unsigned int d) const
+        {
             unsigned int r = m_pos;

-            if (d < (D - 1)) {
+            if(d < (D - 1))
+            {
                 r %= m_parent.m_totalsizes[d];
             }

-            if (d > 0) {
-                r /= m_parent.m_totalsizes[d-1];
+            if(d > 0)
+            {
+                r /= m_parent.m_totalsizes[d - 1];
             }

             return r;
         }

-        bool next_dim0() {
+        bool next_dim0()
+        {
             m_pos++;
             return !done();
         }

-        bool next_dim1() {
+        bool next_dim1()
+        {
             m_pos += m_parent.m_sizes[0] - dim(0);
             return !done();
         }

-        unsigned int dim0_max() const {
+        unsigned int dim0_max() const
+        {
             unsigned int offset = std::min(m_end - m_pos, m_parent.m_sizes[0] - dim(0));
             return dim(0) + offset;
         }
     };

-public:
-    NDRange& operator=(const NDRange& rhs)=default;
-    NDRange(const NDRange& rhs) =default;
-
-    template<typename... T>
-    NDRange(T... ts)
-    : m_sizes{ts...}
+    void set_totalsizes()
     {
-        unsigned int t=1;
+        unsigned int t = 1;
+
+        for(unsigned int i = 0; i < D; i++)
+        {
+            if(m_sizes[i] == 0)
+            {
+                m_sizes[i] = 1;
+            }

-        for (unsigned int i=0; i<D; i++) {
             t *= m_sizes[i];

             m_totalsizes[i] = t;
         }
     }

-    NDRange(const std::array<unsigned int, D>& n)
-    : m_sizes(n)
-    {
-        unsigned int t=1;
+public:
+    NDRange &operator=(const NDRange &rhs) = default;
+    NDRange(const NDRange &rhs)            = default;

-        for (unsigned int i=0; i<D; i++) {
-            t *= m_sizes[i];
+    template <typename... T>
+    NDRange(T... ts)
+        : m_sizes{ ts... }
+    {
+        set_totalsizes();
+    }

-            m_totalsizes[i] = t;
-        }
+    NDRange(const std::array<unsigned int, D> &n)
+        : m_sizes(n)
+    {
+        set_totalsizes();
     }

-    NDRangeIterator iterator(unsigned int start, unsigned int end) const {
+    NDRangeIterator iterator(unsigned int start, unsigned int end) const
+    {
         return NDRangeIterator(*this, start, end);
     }

-    unsigned int total_size() const {
+    unsigned int total_size() const
+    {
         return m_totalsizes[D - 1];
     }

-    unsigned int get_size(unsigned int v) const {
+    unsigned int get_size(unsigned int v) const
+    {
         return m_sizes[v];
     }
 };
@@ -128,58 +147,53 @@ public:
 /** NDCoordinate builds upon a range, but specifies a starting position
  * in addition to a size which it inherits from NDRange
  */
-template<unsigned int N>
-class NDCoordinate : public NDRange<N> {
-    using int_t =unsigned int;
+template <unsigned int N>
+class NDCoordinate : public NDRange<N>
+{
+    using int_t     = unsigned int;
     using ndrange_t = NDRange<N>;

-    std::array<int_t, N> m_positions {};
+    std::array<int_t, N> m_positions{};
+
 public:
-    NDCoordinate& operator=(const NDCoordinate& rhs)=default;
-    NDCoordinate(const NDCoordinate& rhs) =default;
-    NDCoordinate(const std::initializer_list<std::pair<int_t, int_t>>& list)
+    NDCoordinate &operator=(const NDCoordinate &rhs) = default;
+    NDCoordinate(const NDCoordinate &rhs)            = default;
+    NDCoordinate(const std::initializer_list<std::pair<int_t, int_t>> &list)
     {
         std::array<int_t, N> sizes{};

         std::size_t i = 0;
-        for(auto& p : list) {
-            m_positions[i]= p.first;
-            sizes[i++] = p.second;
+        for(auto &p : list)
+        {
+            m_positions[i] = p.first;
+            sizes[i++]     = p.second;
         }

         //update the parents sizes
-        static_cast<ndrange_t&>(*this) = ndrange_t(sizes);
+        static_cast<ndrange_t &>(*this) = ndrange_t(sizes);
     }

-    int_t get_position(int_t d) const {
-        assert(d < m_positions.size());
+    int_t get_position(int_t d) const
+    {
+        assert(d < N);
+
         return m_positions[d];
     }

-    void set_position(int_t d, int_t v) {
-        assert(d < size(m_positions));
-        assert(v < ndrange_t::get_size(d));
+    void set_position(int_t d, int_t v)
+    {
+        assert(d < N);

         m_positions[d] = v;
     }

-    int_t get_position_end(int_t d) const {
-        return get_position(d) + NDRange<N>::get_size(d);
+    int_t get_position_end(int_t d) const
+    {
+        return get_position(d) + ndrange_t::get_size(d);
     }
 }; //class NDCoordinate

-/** @returns the number of dimensions in the NDRange which have none-1 values
- * IE there is actual work in these dimensions that can be broken up
- */
-template<unsigned int N>
-std::size_t ndrange_popcount(const NDRange<N>& ndr) {
-    std::size_t count = 0;
-
-    for(unsigned int d = 0; d != N; ++d) {
-        if(ndr.get_size(d) != 1)
-            ++count;
-    }
-    return count;
-}
+using ndrange_t = NDRange<6>;
+using ndcoord_t = NDCoordinate<6>;

 } // namespace arm_gemm
--
cgit v1.2.1
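
For illustration, a minimal sketch of how the 6-dimensional window types introduced above (ndrange_t / ndcoord_t) can be consumed by a caller. The include path, function name and example sizes are assumptions, not part of the commit; only the NDRange / NDCoordinate interface visible in ndrange.hpp is exercised.

// Hypothetical usage sketch for the ndrange_t / ndcoord_t aliases above.
#include <cassert>

#include "ndrange.hpp" // assumed include path

void window_example()
{
    using namespace arm_gemm;

    // A 6-dimensional compute window: e.g. 20 work items in dimension 0,
    // 3 in dimension 2; unused dimensions are given size 1.
    ndrange_t window(20u, 1u, 3u, 1u, 1u, 1u);

    // total_size() is the product of all dimension sizes.
    assert(window.total_size() == 60);

    // A work range carries a (start, size) pair per dimension - here the
    // first half of dimension 0 and the full extent of the others.
    ndcoord_t work_range{ { 0, 10 }, { 0, 1 }, { 0, 3 }, { 0, 1 }, { 0, 1 }, { 0, 1 } };

    assert(work_range.get_position(0) == 0);      // start of dimension 0
    assert(work_range.get_size(0) == 10);         // extent of dimension 0
    assert(work_range.get_position_end(0) == 10); // start + extent

    // An iterator walks a flat [start, end) sub-range of the window and
    // recovers per-dimension coordinates on demand.
    for(auto it = window.iterator(0, window.total_size() / 2); !it.done(); it.next_dim0())
    {
        unsigned int dim0_coord = it.dim(0); // coordinate in dimension 0
        unsigned int dim2_coord = it.dim(2); // coordinate in dimension 2
        (void)dim0_coord;
        (void)dim2_coord;
    }
}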