diff options
author | Francesco.Petrogalli@arm.com <francesco.petrogalli@arm.com> | 2022-04-05 10:31:08 +0000 |
---|---|---|
committer | Francesco Petrogalli <francesco.petrogalli@arm.com> | 2022-05-24 14:28:27 +0000 |
commit | 5fcf22dadf092efd7aafb359f9229aa270eb1129 (patch) | |
tree | f309426ed19bd6710329da3b530167db72d1c6b2 /src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp | |
parent | a8caa023f0d7b71b3a250a14ceee935052fcc74a (diff) | |
download | ComputeLibrary-5fcf22dadf092efd7aafb359f9229aa270eb1129.tar.gz |
[arm_gemm] Import fixed-format kernels from gemm_linux.
This is a No Functional Change Intended (NFCI) patch. It imports the
kernels into the code base, but the interface for selecting them and for
exposing the format of the weight tensors to the user will be provided in
a subsequent patch.
Kernels and kernel selection code in arm_gemm have been provided
by David.Mansell <David.Mansell@arm.com>.
The kernels are not compiled in the library by default, but need to be
selected via the `scons` option `experimental_fixed_format_kernels=1`.
Resolves: ONCPUML-829
Signed-off-by: Francesco.Petrogalli@arm.com <francesco.petrogalli@arm.com>
Change-Id: If00ccb2b9b7221e01b214cf9783111226ccc8bf4
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7380
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-by: SiCong Li <sicong.li@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp')
-rw-r--r-- | src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp | 140 |
1 file changed, 116 insertions, 24 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp index 5b3ef4203d..c41b0a5b3e 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp @@ -33,6 +33,7 @@ #include "arm_gemm.hpp" #include "bias_adder.hpp" #include "convolver.hpp" +#include "kernel_weight_format.hpp" #include "ndrange.hpp" #include "performance_parameters.hpp" #include "transform.hpp" @@ -54,7 +55,7 @@ namespace { // We need to invoke the kernel differently for quantizing and non-quantizing cases, so here is a shim class to do // that. -template<typename OutputStage, bool SeparateQuantize = false> +template<typename OutputStage, bool SeparateQuantize, bool FixedFormat> class run_hybrid_kernel { public: template<typename strategy, typename Tlo, typename Tro, typename Tr> @@ -63,18 +64,18 @@ public: profiler &prof, #endif const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<Tlo> A_arg, unsigned int M, unsigned int N, - unsigned int kern_k, const Tro *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *bias_ptr, Activation act, bool accumulate, + unsigned int kern_k, const Tro *b_ptr, size_t b_stride, IndirectOutputArg<Tr> output_arg, const Tr *bias_ptr, Activation act, bool accumulate, const OutputStage &os, const int32_t *col_bias, unsigned int n_0 ); }; template<> template<typename strategy, typename Tlo, typename Tro, typename Tr> -inline void run_hybrid_kernel<Nothing, false>::run( +inline void run_hybrid_kernel<Nothing, false, false>::run( #ifdef CYCLE_PROFILING profiler &prof, #endif const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<Tlo> A_arg, unsigned int M, unsigned int N, - unsigned int kern_k, const Tro *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *bias_ptr, Activation act, bool accumulate, + unsigned int kern_k, const Tro *b_ptr, size_t, 
IndirectOutputArg<Tr> output_arg, const Tr *bias_ptr, Activation act, bool accumulate, const Nothing &, const int32_t *, unsigned int) { #ifdef CYCLE_PROFILING auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)M * kern_k * roundup(N, strategy::out_width())); @@ -115,12 +116,60 @@ inline void run_hybrid_kernel<Nothing, false>::run( template<> template<typename strategy, typename Tlo, typename Tro, typename Tr> -inline void run_hybrid_kernel<Requantize32, false>::run( +inline void run_hybrid_kernel<Nothing, false, true>::run( #ifdef CYCLE_PROFILING profiler &prof, #endif const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<Tlo> A_arg, unsigned int M, unsigned int N, - unsigned int kern_k, const Tro *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *, Activation, bool, + unsigned int kern_k, const Tro *b_ptr, size_t b_stride, IndirectOutputArg<Tr> output_arg, const Tr *bias_ptr, Activation act, bool accumulate, + const Nothing &, const int32_t *, unsigned int) { +#ifdef CYCLE_PROFILING + auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)M * kern_k * roundup(N, strategy::out_width())); +#endif + UNUSED(kern_k); + + /* Indirect hybrid kernels read the full width of the bias. So we need to detect the case where we are writing + * a partial block and pad the bias for that block. */ + if (bias_ptr && !accumulate && (N % strategy::out_width() != 0)) { + /* Break N into "N_bulk" (a multiple of output width) and "N_remainder" */ + unsigned int N_remainder = N % strategy::out_width(); + unsigned int N_bulk = N - N_remainder; + + /* Output argument to be used for the tail */ + IndirectOutputArg<Tr> offset_output = output_arg; + + /* If there is a "bulk" to be processed, handle that and update "offset_output" appropriately. 
*/ + if (N_bulk > 0) { + strat.kernel(num_strings, string_ptr, A_arg, M, N_bulk, b_ptr, b_stride, output_arg, bias_ptr, act, accumulate); + + if (output_arg.is_indirect) { + offset_output = IndirectOutputArg<Tr>(output_arg.indirect.ptr, output_arg.indirect.offset + N_bulk); + } else { + offset_output = IndirectOutputArg<Tr>(output_arg.direct.base + N_bulk, output_arg.direct.stride); + } + } + + /* Pad the bias buffer for the remainder */ + Tr *bias_pad_buffer = reinterpret_cast<Tr *>(alloca(strategy::out_width() * sizeof(Tr))); + memcpy(bias_pad_buffer, bias_ptr + N_bulk, N_remainder * sizeof(Tr)); + + /* Process the remainder, offsetting the B pointer as needed. */ + strat.kernel(num_strings, string_ptr, A_arg, M, N_remainder, + b_ptr + (N_bulk / strategy::stripe_width()) * b_stride, b_stride, offset_output, + bias_pad_buffer, act, accumulate); + } else { + strat.kernel(num_strings, string_ptr, A_arg, M, N, b_ptr, b_stride, output_arg, bias_ptr, act, accumulate); + } +} + +template<> +template<typename strategy, typename Tlo, typename Tro, typename Tr> +inline void run_hybrid_kernel<Requantize32, false, false>::run( +#ifdef CYCLE_PROFILING + profiler &prof, +#endif + const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<Tlo> A_arg, unsigned int M, unsigned int N, + unsigned int kern_k, const Tro *b_ptr, size_t, IndirectOutputArg<Tr> output_arg, const Tr *, Activation, bool, const Requantize32 &os, const int32_t *col_bias, unsigned int n_0 ) { #ifdef CYCLE_PROFILING auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)M * kern_k * roundup(N, strategy::out_width())); @@ -132,12 +181,12 @@ inline void run_hybrid_kernel<Requantize32, false>::run( template<> template<typename strategy, typename Tlo, typename Tro, typename Tr> -inline void run_hybrid_kernel<Requantize32, true>::run( +inline void run_hybrid_kernel<Requantize32, true, false>::run( #ifdef CYCLE_PROFILING profiler &prof, #endif const strategy &strat, 
unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<Tlo> A_arg, unsigned int M, unsigned int N, - unsigned int kern_k, const Tro *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *, Activation, bool, + unsigned int kern_k, const Tro *b_ptr, size_t, IndirectOutputArg<Tr> output_arg, const Tr *, Activation, bool, const Requantize32 &os, const int32_t *col_bias, unsigned int n_0 ) { UNUSED(kern_k); // On this route we will only process one kernel height at a time and will make sure this happens in the driver loop. @@ -180,10 +229,38 @@ inline void run_hybrid_kernel<Requantize32, true>::run( } } +template<typename strategy, bool FixedFormat> +struct stripe_width { + static unsigned int get() { + return strategy::stripe_width(); + } +}; + +template<typename strategy> +struct stripe_width<strategy, false> { + static unsigned int get() { + return 0; + } +}; + +template<typename strategy, bool FixedFormat> +struct kernel_weight_format { + static KernelWeightFormat get() { + return strategy::kernel_weight_format(); + } +}; + +template<typename strategy> +struct kernel_weight_format<strategy, false> { + static KernelWeightFormat get() { + return KernelWeightFormat::NON_FIXED; + } +}; + } // anonymous namespace // Implementation of the GemmCommon abstract class. 
-template<typename strategy, typename To, typename Tr, typename OutputStage = Nothing, bool SeparateQuantize = false> +template<typename strategy, typename To, typename Tr, typename OutputStage=Nothing, bool SeparateQuantize=false, bool FixedFormat=false> class GemmHybridIndirect : public GemmCommon<To, Tr> { typedef typename strategy::lhs_operand_type Tloi; typedef typename strategy::rhs_operand_type Troi; @@ -425,24 +502,32 @@ public: const unsigned int nmax = std::min(n0 + _n_block, _args._Nsize); const unsigned int multi = p.dim(3); - const Troi *b_panel = _B_transposed + - (multi * roundup(_args._Nsize, strategy::out_width()) * _Ktotal) + - (k0 * roundup(_args._Nsize, strategy::out_width())) + - (n0 * kern_k); + const Troi *b_panel; + if (FixedFormat) { + b_panel = reinterpret_cast<const Troi *>(this->_Bptr) + + (multi * this->_B_multi_stride) + + ((n0 / stripe_width<strategy, FixedFormat>::get()) * this->_ldb) + + (k0 * stripe_width<strategy, FixedFormat>::get()); + } else { + b_panel = _B_transposed + + (multi * roundup(_args._Nsize, strategy::out_width()) * _Ktotal) + + (k0 * roundup(_args._Nsize, strategy::out_width())) + + (n0 * kern_k); + } - IndirectOutputArg<Tr> out_arg(this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + n0, this->_ldc); + IndirectOutputArg<Tr> out_arg(this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + n0, this->_ldc); #ifdef CYCLE_PROFILING auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(m_end - m_start) * kern_k * roundup(nmax-n0, strategy::out_width())); #endif if (_indirect_buf) { - run_hybrid_kernel<OutputStage, SeparateQuantize>::run( + run_hybrid_kernel<OutputStage, SeparateQuantize, FixedFormat>::run( #ifdef CYCLE_PROFILING prof, #endif strat, sections, string_lengths.data(), IndirectInputArg<To>(_indirect_buf + (multi * _args._nbatches * _args._Ksections) + (batch * _args._Ksections) + first_section, 
m_start, first_offset), - (m_end - m_start), (nmax - n0), kern_k, b_panel, out_arg, + (m_end - m_start), (nmax - n0), kern_k, b_panel, this->_ldb, out_arg, (this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr, last_pass ? _args._act : Activation(), !first_pass, @@ -469,13 +554,13 @@ public: } assert(pos == sections); - run_hybrid_kernel<OutputStage, SeparateQuantize>::run( + run_hybrid_kernel<OutputStage, SeparateQuantize, FixedFormat>::run( #ifdef CYCLE_PROFILING prof, #endif strat, sections, string_lengths.data(), IndirectInputArg<To>(in_row_strings.data(), 0, first_offset), - (m_end - m_start), (nmax - n0), kern_k, b_panel, out_arg, + (m_end - m_start), (nmax - n0), kern_k, b_panel, this->_ldb, out_arg, (this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr, last_pass ? _args._act : Activation(), !first_pass, @@ -485,13 +570,13 @@ public: // Length to process. This needs to exclude padding, but 'kmax' potentially includes it. const unsigned int len = (std::min(_args._Ksize, kmax) - k0); - run_hybrid_kernel<OutputStage, SeparateQuantize>::run( + run_hybrid_kernel<OutputStage, SeparateQuantize, FixedFormat>::run( #ifdef CYCLE_PROFILING prof, #endif strat, 1, &len, IndirectInputArg<To>(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + m_start * this->_lda + k0, this->_lda), - (m_end - m_start), (nmax - n0), kern_k, b_panel, out_arg, + (m_end - m_start), (nmax - n0), kern_k, b_panel, this->_ldb, out_arg, (this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr, last_pass ? 
_args._act : Activation(), !first_pass, @@ -504,14 +589,18 @@ public: // Interface implementation - pretransposed bool B_is_pretransposed() const override { - return true; + return (FixedFormat == false); } bool B_pretranspose_required() const override { - return (_B_transposed==nullptr); + return (FixedFormat == false) && (_B_transposed==nullptr); } size_t get_B_pretransposed_array_size() const override { + if (FixedFormat) { + return 0; + } + // Start with actual pretransposed buffer... size_t size = roundup(_args._Nsize, strategy::out_width()) * _Ktotal * _args._nmulti * sizeof(Troi); @@ -599,8 +688,7 @@ public: } } } else { - // In the single K section case, can process the whole lot in one go. - // Caution: 'blockwalker::kmax()' rounds up, so clamp to valid _Ksize. + // In the single K section case, can process the whole lot in one go. strat.transforms.PrepareB(buffer, B + (multi * B_multi_stride), ldb, 0, _args._Nsize, k0, std::min(kmax, _args._Ksize)); buffer += roundup(_args._Nsize, strategy::out_width()) * roundup(kmax-k0, strategy::k_unroll()); @@ -694,11 +782,15 @@ public: c.inner_block_size = _k_block; c.outer_block_size = _n_block; c.filter = get_type_name<strategy>(); + c.weight_format = get_weight_format(kernel_weight_format<strategy, FixedFormat>::get(), sizeof(To)); return c; } }; +template<typename strategy, typename To, typename Tr, typename OutputStage=Nothing> +using GemmHybridIndirectFixedFormat = GemmHybridIndirect<strategy, To, Tr, OutputStage, false, true>; + } // namespace arm_gemm #ifdef __I_DEFINED_UNUSED |