diff options
Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp')
-rw-r--r-- | src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp | 143 |
1 files changed, 118 insertions, 25 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp index c75c320a6b..4ad54426e9 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -27,7 +27,9 @@ #include <cassert> #include "arm_gemm.hpp" +#include "bfloat.hpp" #include "convolver.hpp" +#include "kernel_weight_format.hpp" #include "mergeresults.hpp" #include "performance_parameters.hpp" #include "quantized.hpp" @@ -56,7 +58,7 @@ namespace { // Others output directly to the matrix result. This helper class calls the // appropriate functions, using templating to avoid calling non-existent // functions. -template<bool MergeStep, typename OutputStage> +template<bool MergeStep, bool FixedFormat, typename OutputStage> class kernel_and_merge { public: template<typename strategy, typename To, typename Tr, typename Tri, typename Tab> @@ -64,7 +66,7 @@ public: #ifdef CYCLE_PROFILING profiler &prof, #endif - strategy &strat, const To *a_ptr, const To *b_panel, Tri *c_panel, + strategy &strat, const To *a_ptr, const To *b_panel, size_t b_stride, Tri *c_panel, Tr *c_ptr, int ldc, int kern_k, unsigned int m_0, unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *biasptr, const Activation &act, bool accumulate, const OutputStage &os, const int32_t *col_bias, @@ -74,11 +76,11 @@ public: // Run a kernel and call the separate merge step template<> template<typename strategy, typename To, typename Tr, typename Tri, typename Tab> -void kernel_and_merge<true, Nothing>::run( +void kernel_and_merge<true, false, Nothing>::run( #ifdef CYCLE_PROFILING profiler &prof, #endif - strategy &strat, const To *a_ptr, const To *b_panel, Tri *c_panel, + strategy &strat, const To *a_ptr, const To *b_panel, size_t, Tri *c_panel, Tr *c_ptr, int ldc, int kern_k, unsigned int m_0, unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *biasptr, const Activation &act, bool accumulate, const Nothing &, const int32_t *, Tab *) @@ -101,14 +103,44 @@ void kernel_and_merge<true, Nothing>::run( } } +// Run a fixed-format kernel and call the separate merge step +template<> +template<typename strategy, typename To, typename Tr, typename Tri, typename Tab> +void kernel_and_merge<true, true, Nothing>::run( +#ifdef CYCLE_PROFILING + profiler &prof, +#endif + strategy &strat, const To *a_ptr, const To *b_panel, size_t b_stride, Tri *c_panel, + Tr *c_ptr, int ldc, int kern_k, unsigned int m_0, + unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *biasptr, + const Activation &act, bool accumulate, const Nothing &, const int32_t *, Tab *) +{ + { +#ifdef CYCLE_PROFILING + const int bblocks = iceildiv(n_max - n_0, strategy::out_width()); + auto p=prof.ScopedProfiler(PROFILE_KERNEL, (strategy::out_height() * bblocks * strategy::out_width() * kern_k)); +#endif + + strat.kernel(a_ptr, b_panel, b_stride, c_panel, 1, (n_max - n_0), kern_k); + } + + { +#ifdef CYCLE_PROFILING + const int bblocks = iceildiv(n_max - n_0, strategy::out_width()); + auto p=prof.ScopedProfiler(PROFILE_MERGE, (strategy::out_height() * bblocks * strategy::out_width() * sizeof(Tr))); +#endif + strat.transforms.Merge(c_ptr, c_panel, ldc, m_0, m_max, n_0, n_max, biasptr, act, accumulate); + } +} + // Run a kernel with integrated merge template<> template<typename strategy, typename To, typename Tr, typename Tri, typename Tab> -void kernel_and_merge<false, Nothing>::run( +void kernel_and_merge<false, false, Nothing>::run( #ifdef CYCLE_PROFILING profiler &prof, #endif - strategy &strat, const To *a_ptr, const To *b_panel, Tri *, + strategy &strat, const To *a_ptr, const To *b_panel, size_t, Tri *, Tr *c_ptr, int ldc, int kern_k, unsigned int m_0, unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *biasptr, const Activation &act, bool accumulate, const Nothing &, const int32_t *, @@ -143,11 +175,11 @@ void kernel_and_merge<false, Nothing>::run( // Run a kernel with integrated merge, quantizing template<> template<typename strategy, typename To, typename Tr, typename Tri, typename Tab> -void kernel_and_merge<false, Requantize32>::run( +void kernel_and_merge<false, false, Requantize32>::run( #ifdef CYCLE_PROFILING profiler &prof, #endif - strategy &strat, const To *a_ptr, const To *b_panel, Tri *, + strategy &strat, const To *a_ptr, const To *b_panel, size_t, Tri *, Tr *c_ptr, int ldc, int kern_k, unsigned int m_0, unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *, const Activation &, bool accumulate, const Requantize32 &qp, const int32_t *col_bias, @@ -170,11 +202,11 @@ void kernel_and_merge<false, Requantize32>::run( // Run a kernel and call the separate quantize step template<> template<typename strategy, typename To, typename Tr, typename Tri, typename Tab> -void kernel_and_merge<true, Requantize32>::run( +void kernel_and_merge<true, false, Requantize32>::run( #ifdef CYCLE_PROFILING profiler &prof, #endif - strategy &strat, const To *a_ptr, const To *b_panel, Tri *c_panel, + strategy &strat, const To *a_ptr, const To *b_panel, size_t, Tri *c_panel, Tr *c_ptr, int ldc, int kern_k, unsigned int m_0, unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *, const Activation &, bool, const Requantize32 &qp, const int32_t *col_bias, @@ -246,9 +278,49 @@ public: typedef int32_t type; }; +// Stripe width is a concept only needed for FixedFormat kernels. Use an accessor to avoid issues in other scenarios. +template<typename strategy, bool FixedFormat> +struct get_stripe_width { + static unsigned int get() { + return 0; + } +}; + +template<typename strategy> +struct get_stripe_width<strategy, true> { + static unsigned int get() { + return strategy::stripe_width(); + } +}; + +// KernelWeightFormat is a similar story. +template<typename strategy, bool FixedFormat, typename To> +struct get_kernel_weight_format { + static KernelWeightFormat get() { + return KernelWeightFormat::NON_FIXED; + } +}; + +template<typename strategy, typename To> +struct get_kernel_weight_format<strategy, true, To> { + static KernelWeightFormat get() { + KernelWeightFormat kwf = strategy::kernel_weight_format(); + + // If we are using a BF16 kernel to do an FP32 problem (fast mode) then we need to set the BF16 flag on the + // weight format. + if (std::is_same<To, float>::value && std::is_same<typename strategy::operand_type, bfloat16>::value) { + uint32_t kwf_i = static_cast<uint32_t>(kwf); + kwf_i |= 0x10; + kwf = static_cast<KernelWeightFormat>(kwf_i); + } + + return kwf; + } +}; + } // anonymous namespace -template<typename strategy, typename To, typename Tr, typename OutputStage=Nothing, bool MergeStep=true, bool ForceThreadColumns=false> +template<typename strategy, typename To, typename Tr, typename OutputStage=Nothing, bool MergeStep=true, bool FixedFormat=false, bool ForceThreadColumns=false> class GemmInterleaved : public GemmCommon<To, Tr> { typedef typename strategy::operand_type Toi; typedef typename strategy::result_type Tri; @@ -310,7 +382,7 @@ class GemmInterleaved : public GemmCommon<To, Tr> { class blockwalker { private: /* Size loops, etc. based on our parent's configuration */ - const GemmInterleaved<strategy, To, Tr, OutputStage, MergeStep, ForceThreadColumns> &_parent; + const GemmInterleaved<strategy, To, Tr, OutputStage, MergeStep, FixedFormat, ForceThreadColumns> &_parent; /* K, X and multi parameters for current iteration. */ unsigned int _k0=0, _x0=0, _multi=0; @@ -325,9 +397,9 @@ class GemmInterleaved : public GemmCommon<To, Tr> { bool _newmulti=true; public: - blockwalker(const GemmInterleaved<strategy, To, Tr, OutputStage, MergeStep, ForceThreadColumns> &parent) : _parent(parent) { } + blockwalker(const GemmInterleaved<strategy, To, Tr, OutputStage, MergeStep, FixedFormat, ForceThreadColumns> &parent) : _parent(parent) { } - blockwalker(const GemmInterleaved<strategy, To, Tr, OutputStage, MergeStep, ForceThreadColumns> &parent, + blockwalker(const GemmInterleaved<strategy, To, Tr, OutputStage, MergeStep, FixedFormat, ForceThreadColumns> &parent, unsigned int x_start, unsigned int x_end) : _parent(parent), _x0 (_x_start), _x_start(x_start), _x_end(x_end) { } unsigned int xmax() { @@ -666,7 +738,11 @@ public: // Figure out how many "K" the kernel will actually process. unsigned int kern_k = roundup(kmax - k0, strategy::k_unroll()); - const Toi *b_ptr = _B_transposed + (rounded_width * _Ktotal * multi) + (k0 * rounded_width) + (start_x * kern_k); + const Toi *b_ptr = FixedFormat ? + reinterpret_cast<const Toi *>(this->_Bptr) + (multi * this->_B_multi_stride) + + ((start_x / get_stripe_width<strategy, FixedFormat>::get()) * this->_ldb) + + (k0 * get_stripe_width<strategy, FixedFormat>::get()) : + _B_transposed + (rounded_width * _Ktotal * multi) + (k0 * rounded_width) + (start_x * kern_k); unsigned int batch = batch_0; unsigned int start_row = (start - (batch_0 * window_per_batch)) * strategy::out_height(); @@ -699,12 +775,12 @@ public: } // Perform the kernel and merge step, either separately or together as required. - kernel_and_merge<MergeStep, OutputStage>::run( + kernel_and_merge<MergeStep, FixedFormat, OutputStage>::run( #ifdef CYCLE_PROFILING prof, #endif // Strategy and panel pointers - strat, a_panel, b_ptr, c_panel, + strat, a_panel, b_ptr, this->_ldb, c_panel, // Result buffer pointers this->_Cptr + (batch * this->_C_batch_stride) + (multi * this->_C_multi_stride), this->_ldc, // K size, and M/N ranges @@ -802,6 +878,13 @@ public: } } + // For FixedFormat cases, figure out the B pointer. The loop below moves through batches and vertically through the output so this will be the same throughout. + if (FixedFormat) { + b_panel = reinterpret_cast<const Toi *>(this->_Bptr) + (current.multi() * this->_B_multi_stride) + + ((current.x0() / get_stripe_width<strategy, FixedFormat>::get()) * this->_ldb) + + (current.k0() * get_stripe_width<strategy, FixedFormat>::get()); + } + /* Do the actual work. */ for (unsigned int batch = batch_0; batch <= batch_end; batch++) { unsigned int first_m = (batch == batch_0) ? m_0 : 0; @@ -840,12 +923,12 @@ public: } // Perform the kernel and merge step, either separately or together as required. - kernel_and_merge<MergeStep, OutputStage>::run( + kernel_and_merge<MergeStep, FixedFormat, OutputStage>::run( #ifdef CYCLE_PROFILING prof, #endif // Strategy and panel pointers - strat, a_ptr, b_panel, c_panel, + strat, a_ptr, b_panel, this->_ldb, c_panel, // Result buffer pointers result_ptr, this->_ldc, // K size, and M/N ranges @@ -863,7 +946,9 @@ public: } } - b_panel += (roundup(current.xmax() - current.x0(), strategy::out_width()) * kern_k); + if (FixedFormat == false) { + b_panel += (roundup(current.xmax() - current.x0(), strategy::out_width()) * kern_k); + } } } } @@ -910,14 +995,18 @@ public: // Interface implementation - pretransposed bool B_is_pretransposed() const override { - return true; + return (FixedFormat == false); } bool B_pretranspose_required() const override { - return (_B_transposed==nullptr); + return (FixedFormat == false) && (_B_transposed==nullptr); } size_t get_B_pretransposed_array_size() const override { + if (FixedFormat) { + return 0; + } + unsigned int x_size = roundup(_Nsize, strategy::out_width()); return (x_size * _Ktotal * _nmulti * sizeof(Toi)) + get_col_sum_size(); @@ -939,7 +1028,7 @@ public: void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override { requantize_bias(in_buffer, B, ldb, B_multi_stride); - // Put the transposed data after the column sums - in non-transposing cases get_col_sum_size() == 0 + // Put the transposed data after the column sums - in non-quantized cases get_col_sum_size() == 0 uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer); Toi *buffer = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size()); _B_transposed = buffer; @@ -1005,7 +1094,7 @@ public: } void set_pretransposed_B_data(void *in_buffer) override { - // Put the transposed data after the column sums - in non-transposing cases get_col_sum_size() == 0 + // Put the transposed data after the column sums - in non-quantized cases get_col_sum_size() == 0 uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer); _B_transposed = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size()); col_bias = reinterpret_cast<int32_t *>(in_buffer); @@ -1065,6 +1154,7 @@ public: c.inner_block_size = _k_block; c.outer_block_size = _x_block; c.filter = get_type_name<strategy>(); + c.weight_format = get_weight_format(get_kernel_weight_format<strategy, FixedFormat, To>::get(), sizeof(To)); return c; } @@ -1074,6 +1164,9 @@ public: template<typename strategy, typename To, typename Tr, typename OutputStage=Nothing> using GemmInterleavedNoMerge = GemmInterleaved<strategy, To, Tr, OutputStage, false>; +template<typename strategy, typename To, typename Tr, typename OutputStage=Nothing> +using GemmInterleavedFixedFormat = GemmInterleaved<strategy, To, Tr, OutputStage, true, true>; + template<typename strategy, typename To, typename Tr> using GemmInterleavedPretransposedNoMergeQuantizedInline = GemmInterleaved<strategy, To, Tr, Requantize32, false>; |