Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp')
-rw-r--r-- | src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp | 347
1 file changed, 258 insertions, 89 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp
index 41fecc6bec..0cc4d4f3d9 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,7 +23,9 @@
  */
 #pragma once

+#if !defined(_WIN64) && !defined(__OpenBSD__)
 #include <alloca.h>
+#endif /* !defined(_WIN64) && !defined(__OpenBSD__) */

 #include <algorithm>
 #include <cassert>
@@ -31,6 +33,7 @@
 #include "arm_gemm.hpp"
 #include "bias_adder.hpp"
 #include "convolver.hpp"
+#include "kernel_weight_format.hpp"
 #include "ndrange.hpp"
 #include "performance_parameters.hpp"
 #include "transform.hpp"
@@ -52,34 +55,34 @@ namespace {
 // We need to invoke the kernel differently for quantizing and non-quantizing cases, so here is a shim class to do
 // that.
-template<typename OutputStage, bool SeparateQuantize = false>
+template<typename OutputStage, bool SeparateQuantize, bool FixedFormat>
 class run_hybrid_kernel {
 public:
-    template<typename strategy, typename To, typename Tr>
-    static void run (
+    template<typename strategy, typename Tlo, typename Tro, typename Tr>
+    static inline void run (
 #ifdef CYCLE_PROFILING
         profiler &prof,
 #endif
-        const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<To> A_arg, unsigned int M, unsigned int N,
-        unsigned int kern_k, const To *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *bias_ptr, Activation act, bool accumulate,
+        const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<Tlo> A_arg, unsigned int M, unsigned int N,
+        unsigned int kern_k, const Tro *b_ptr, size_t b_stride, IndirectOutputArg<Tr> output_arg, const Tr *bias_ptr, Activation act, bool accumulate,
         const OutputStage &os, const int32_t *col_bias, unsigned int n_0 );
 };

 template<>
-template<typename strategy, typename To, typename Tr>
-void run_hybrid_kernel<Nothing, false>::run(
+template<typename strategy, typename Tlo, typename Tro, typename Tr>
+inline void run_hybrid_kernel<Nothing, false, false>::run(
 #ifdef CYCLE_PROFILING
     profiler &prof,
 #endif
-    const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<To> A_arg, unsigned int M, unsigned int N,
-    unsigned int kern_k, const To *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *bias_ptr, Activation act, bool accumulate,
+    const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<Tlo> A_arg, unsigned int M, unsigned int N,
+    unsigned int kern_k, const Tro *b_ptr, size_t, IndirectOutputArg<Tr> output_arg, const Tr *bias_ptr, Activation act, bool accumulate,
     const Nothing &, const int32_t *, unsigned int) {
 #ifdef CYCLE_PROFILING
     auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)M * kern_k * roundup(N, strategy::out_width()));
 #endif
     UNUSED(kern_k);

-    /* Indirect hybrid kernels read the full width of the bias. So we need to detect the case where we are writing
+    /* Indirect hybrid kernels read the full width of the bias. So we need to detect the case where we are writing
      * a partial block and pad the bias for that block. */
     if (bias_ptr && !accumulate && (N % strategy::out_width() != 0)) {
         /* Break N into "N_bulk" (a multiple of output width) and "N_remainder" */
@@ -112,13 +115,61 @@ void run_hybrid_kernel<Nothing, false>::run(
 }

 template<>
-template<typename strategy, typename To, typename Tr>
-void run_hybrid_kernel<Requantize32, false>::run(
+template<typename strategy, typename Tlo, typename Tro, typename Tr>
+inline void run_hybrid_kernel<Nothing, false, true>::run(
 #ifdef CYCLE_PROFILING
     profiler &prof,
 #endif
-    const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<To> A_arg, unsigned int M, unsigned int N,
-    unsigned int kern_k, const To *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *, Activation, bool,
+    const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<Tlo> A_arg, unsigned int M, unsigned int N,
+    unsigned int kern_k, const Tro *b_ptr, size_t b_stride, IndirectOutputArg<Tr> output_arg, const Tr *bias_ptr, Activation act, bool accumulate,
+    const Nothing &, const int32_t *, unsigned int) {
+#ifdef CYCLE_PROFILING
+    auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)M * kern_k * roundup(N, strategy::out_width()));
+#endif
+    UNUSED(kern_k);
+
+    /* Indirect hybrid kernels read the full width of the bias. So we need to detect the case where we are writing
+     * a partial block and pad the bias for that block. */
+    if (bias_ptr && !accumulate && (N % strategy::out_width() != 0)) {
+        /* Break N into "N_bulk" (a multiple of output width) and "N_remainder" */
+        unsigned int N_remainder = N % strategy::out_width();
+        unsigned int N_bulk = N - N_remainder;
+
+        /* Output argument to be used for the tail */
+        IndirectOutputArg<Tr> offset_output = output_arg;
+
+        /* If there is a "bulk" to be processed, handle that and update "offset_output" appropriately. */
+        if (N_bulk > 0) {
+            strat.kernel(num_strings, string_ptr, A_arg, M, N_bulk, b_ptr, b_stride, output_arg, bias_ptr, act, accumulate);
+
+            if (output_arg.is_indirect) {
+                offset_output = IndirectOutputArg<Tr>(output_arg.indirect.ptr, output_arg.indirect.offset + N_bulk);
+            } else {
+                offset_output = IndirectOutputArg<Tr>(output_arg.direct.base + N_bulk, output_arg.direct.stride);
+            }
+        }
+
+        /* Pad the bias buffer for the remainder */
+        Tr *bias_pad_buffer = reinterpret_cast<Tr *>(alloca(strategy::out_width() * sizeof(Tr)));
+        memcpy(bias_pad_buffer, bias_ptr + N_bulk, N_remainder * sizeof(Tr));
+
+        /* Process the remainder, offsetting the B pointer as needed. */
+        strat.kernel(num_strings, string_ptr, A_arg, M, N_remainder,
+                     b_ptr + (N_bulk / strategy::stripe_width()) * b_stride, b_stride, offset_output,
+                     bias_pad_buffer, act, accumulate);
+    } else {
+        strat.kernel(num_strings, string_ptr, A_arg, M, N, b_ptr, b_stride, output_arg, bias_ptr, act, accumulate);
+    }
+}
+
+template<>
+template<typename strategy, typename Tlo, typename Tro, typename Tr>
+inline void run_hybrid_kernel<Requantize32, false, false>::run(
+#ifdef CYCLE_PROFILING
+    profiler &prof,
+#endif
+    const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<Tlo> A_arg, unsigned int M, unsigned int N,
+    unsigned int kern_k, const Tro *b_ptr, size_t, IndirectOutputArg<Tr> output_arg, const Tr *, Activation, bool,
     const Requantize32 &os, const int32_t *col_bias, unsigned int n_0 ) {
 #ifdef CYCLE_PROFILING
     auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)M * kern_k * roundup(N, strategy::out_width()));
@@ -129,13 +180,13 @@ void run_hybrid_kernel<Requantize32, false>::run(
 }

 template<>
-template<typename strategy, typename To, typename Tr>
-void run_hybrid_kernel<Requantize32, true>::run(
+template<typename strategy, typename Tlo, typename Tro, typename Tr>
+inline void run_hybrid_kernel<Requantize32, true, false>::run(
 #ifdef CYCLE_PROFILING
     profiler &prof,
 #endif
-    const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<To> A_arg, unsigned int M, unsigned int N,
-    unsigned int kern_k, const To *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *, Activation, bool,
+    const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<Tlo> A_arg, unsigned int M, unsigned int N,
+    unsigned int kern_k, const Tro *b_ptr, size_t, IndirectOutputArg<Tr> output_arg, const Tr *, Activation, bool,
     const Requantize32 &os, const int32_t *col_bias, unsigned int n_0 ) {
     UNUSED(kern_k);
     // On this route we will only process one kernel height at a time and will make sure this happens in the driver loop.
@@ -178,12 +229,41 @@ void run_hybrid_kernel<Requantize32, true>::run(
     }
 }

+template<typename strategy, bool FixedFormat>
+struct stripe_width {
+    static unsigned int get() {
+        return strategy::stripe_width();
+    }
+};
+
+template<typename strategy>
+struct stripe_width<strategy, false> {
+    static unsigned int get() {
+        return 0;
+    }
+};
+
+template<typename strategy, bool FixedFormat>
+struct kernel_weight_format {
+    static KernelWeightFormat get() {
+        return strategy::kernel_weight_format();
+    }
+};
+
+template<typename strategy>
+struct kernel_weight_format<strategy, false> {
+    static KernelWeightFormat get() {
+        return KernelWeightFormat::NON_FIXED;
+    }
+};
+
 } // anonymous namespace

 // Implementation of the GemmCommon abstract class.
-template<typename strategy, typename To, typename Tr, typename OutputStage = Nothing, bool SeparateQuantize = false>
+template<typename strategy, typename To, typename Tr, typename OutputStage=Nothing, bool SeparateQuantize=false, bool FixedFormat=false>
 class GemmHybridIndirect : public GemmCommon<To, Tr> {
-    typedef typename strategy::operand_type Toi;
+    typedef typename strategy::lhs_operand_type Tloi;
+    typedef typename strategy::rhs_operand_type Troi;
     typedef typename strategy::result_type Tri;

     GemmArgs _args;
@@ -201,7 +281,7 @@ class GemmHybridIndirect : public GemmCommon<To, Tr> {
     const unsigned int _Mround;

     /* Pretransposed buffer. */
-    const Toi *_B_transposed=nullptr;
+    const Troi *_B_transposed=nullptr;

     /* Indirect parameters. _indirect_buf doubles as a flag to indicate that "indirect" transform should be used. */
     const To * const * const * _indirect_buf = nullptr;
@@ -233,7 +313,7 @@ class GemmHybridIndirect : public GemmCommon<To, Tr> {
         }

         if (args._cfg && args._cfg->inner_block_size) {
-            return args._cfg->inner_block_size;
+            return roundup(args._cfg->inner_block_size, strategy::k_unroll());
         }

         // Experimental data suggests an optimal block size of 512 for FP32 (scaling accordingly for other
@@ -356,11 +436,11 @@ public:

        // In convolution mode, we need input pointers.
        if (_convolver) {
-            in_row_ptrs.resize(strategy::out_height() * _args._Ksections, nullptr);
-            in_row_strings.resize(_args._Ksections, nullptr);
+            in_row_ptrs = std::vector<const To *>(strategy::out_height() * _args._Ksections, nullptr);
+            in_row_strings = std::vector<const To * const *>(_args._Ksections, nullptr);

             for (unsigned int i=0; i<_args._Ksections; i++) {
-                in_row_strings[i] = &(in_row_ptrs[i * strategy::out_height()]);
+                in_row_strings[i] = &(in_row_ptrs.data()[i * strategy::out_height()]);
             }
         }

@@ -370,8 +450,8 @@ public:
         }

         /* Make sure we've been set up correctly. */
-        assert(_B_transposed);
-        static_assert(std::is_same<To, Toi>::value, "gemm_native: Operand types must be the same.");
+        assert(FixedFormat || _B_transposed);
+        static_assert(std::is_same<To, Tloi>::value, "gemm_native: Operand types must be the same.");
 //      static_assert(std::is_same<Tr, Tri>::value, "gemm_native: Result types must be the same.");

         /* For now, each work item implies all the K for a given output
@@ -422,27 +502,35 @@ public:
                 const unsigned int nmax = std::min(n0 + _n_block, _args._Nsize);
                 const unsigned int multi = p.dim(3);

-                const Toi *b_panel = _B_transposed +
-                                     (multi * roundup(_args._Nsize, strategy::out_width()) * _Ktotal) +
-                                     (k0 * roundup(_args._Nsize, strategy::out_width())) +
-                                     (n0 * kern_k);
+                const Troi *b_panel;
+                if (FixedFormat) {
+                    b_panel = reinterpret_cast<const Troi *>(this->_Bptr) +
+                              (multi * this->_B_multi_stride) +
+                              ((n0 / stripe_width<strategy, FixedFormat>::get()) * this->_ldb) +
+                              (k0 * stripe_width<strategy, FixedFormat>::get());
+                } else {
+                    b_panel = _B_transposed +
+                              (multi * roundup(_args._Nsize, strategy::out_width()) * _Ktotal) +
+                              (k0 * roundup(_args._Nsize, strategy::out_width())) +
+                              (n0 * kern_k);
+                }

-                IndirectOutputArg<Tr> out_arg(this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + n0, this->_ldc);
+                IndirectOutputArg<Tr> out_arg(this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + n0, this->_ldc);

 #ifdef CYCLE_PROFILING
                 auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(m_end - m_start) * kern_k * roundup(nmax-n0, strategy::out_width()));
 #endif
                 if (_indirect_buf) {
-                    run_hybrid_kernel<OutputStage, SeparateQuantize>::run(
+                    run_hybrid_kernel<OutputStage, SeparateQuantize, FixedFormat>::run(
 #ifdef CYCLE_PROFILING
                         prof,
 #endif
                         strat, sections, string_lengths.data(),
                         IndirectInputArg<To>(_indirect_buf + (multi * _args._nbatches * _args._Ksections) + (batch * _args._Ksections) + first_section, m_start, first_offset),
-                        (m_end - m_start), (nmax - n0), kern_k, b_panel, out_arg,
+                        (m_end - m_start), (nmax - n0), kern_k, b_panel, this->_ldb, out_arg,
                         (this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr,
                         last_pass ? _args._act : Activation(),
-                        !first_pass,
+                        !first_pass || _args._accumulate,
                         // Quantization parameters
                         _os, _col_bias+(multi * _args._Nsize), n0);
                 } else if (_convolver) {
@@ -466,32 +554,32 @@ public:
                     }
                     assert(pos == sections);

-                    run_hybrid_kernel<OutputStage, SeparateQuantize>::run(
+                    run_hybrid_kernel<OutputStage, SeparateQuantize, FixedFormat>::run(
 #ifdef CYCLE_PROFILING
                         prof,
 #endif
                         strat, sections, string_lengths.data(),
                         IndirectInputArg<To>(in_row_strings.data(), 0, first_offset),
-                        (m_end - m_start), (nmax - n0), kern_k, b_panel, out_arg,
+                        (m_end - m_start), (nmax - n0), kern_k, b_panel, this->_ldb, out_arg,
                         (this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr,
                         last_pass ? _args._act : Activation(),
-                        !first_pass,
+                        !first_pass || _args._accumulate,
                         // Quantization parameters
                         _os, _col_bias+(multi * _args._Nsize), n0);
                 } else {
                     // Length to process. This needs to exclude padding, but 'kmax' potentially includes it.
                     const unsigned int len = (std::min(_args._Ksize, kmax) - k0);

-                    run_hybrid_kernel<OutputStage, SeparateQuantize>::run(
+                    run_hybrid_kernel<OutputStage, SeparateQuantize, FixedFormat>::run(
 #ifdef CYCLE_PROFILING
                         prof,
 #endif
                         strat, 1, &len,
                         IndirectInputArg<To>(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + m_start * this->_lda + k0, this->_lda),
-                        (m_end - m_start), (nmax - n0), kern_k, b_panel, out_arg,
+                        (m_end - m_start), (nmax - n0), kern_k, b_panel, this->_ldb, out_arg,
                         (this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr,
                         last_pass ? _args._act : Activation(),
-                        !first_pass,
+                        !first_pass || _args._accumulate,
                         // Quantization parameters
                         _os, _col_bias+(multi * _args._Nsize), n0);
                 }
@@ -501,16 +589,20 @@ public:

     // Interface implementation - pretransposed
     bool B_is_pretransposed() const override {
-        return true;
+        return (FixedFormat == false);
     }

     bool B_pretranspose_required() const override {
-        return (_B_transposed==nullptr);
+        return (FixedFormat == false) && (_B_transposed==nullptr);
     }

     size_t get_B_pretransposed_array_size() const override {
+        if (FixedFormat) {
+            return 0;
+        }
+
         // Start with actual pretransposed buffer...
-        size_t size = roundup(_args._Nsize, strategy::out_width()) * _Ktotal * _args._nmulti * sizeof(Toi);
+        size_t size = roundup(_args._Nsize, strategy::out_width()) * _Ktotal * _args._nmulti * sizeof(Troi);

         // Space for result row pointers (not strictly needed any more but retained for indirect output testing)
         size += _args._Msize * _args._nbatches * _args._nmulti * sizeof(const Tr *);
@@ -522,7 +614,11 @@ public:
         return size;
     }

-    void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
+    size_t get_B_pretranspose_window_size() const override {
+        return _args._nmulti * iceildiv(_args._Nsize, strategy::out_width());
+    }
+
+    void requantize_bias(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
         if (std::is_same<OutputStage, Requantize32>::value) {
             _col_bias = reinterpret_cast<int32_t *>(in_buffer);

@@ -533,62 +629,115 @@ public:
                 compute_col_sums(*qp_ptr, _args._Nsize, _args._Ksize * _args._Ksections, B + (i * B_multi_stride), ldb, _col_bias + (i * _args._Nsize), _args._Ksize * _args._Ksections, i, 0);
             }
         }
+    }
+
+    bool B_pretranspose_supports_transpose() const override {
+        strategy strat(_args._ci);
+        return strat.transforms.PrepareB_supports_transpose();
+    }
+
+    void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride, bool transposed) override {
+        pretranspose_B_array_part(in_buffer, B, ldb, B_multi_stride, transposed, 0, get_B_pretranspose_window_size());
+    }
+
+    void pretranspose_B_array_part(void *in_buffer, const To *B, const int ldb, const int B_multi_stride, bool transposed, size_t start, size_t end) override {
+        if (end >= get_B_pretranspose_window_size()) {
+            requantize_bias(in_buffer, B, ldb, B_multi_stride);
+        }

         // Put the transposed data after the column sums - in non-transposing cases get_col_sum_size() == 0
         uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
-        Toi *buffer = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size());
-        _B_transposed = buffer;
+        Troi *buffer_base = reinterpret_cast<Troi *>(buffer_int + get_col_sum_size());
+        _B_transposed = buffer_base;

         strategy strat(_args._ci);
+        size_t work_per_multi = iceildiv(_args._Nsize, strategy::out_width());
+
+        for (unsigned int multi=(start / work_per_multi); multi<_args._nmulti; multi++) {
+            // Work out which part of the window space this multi occupies,
+            // skip to the next multi or exit as needed.
+            size_t wk_start = multi * work_per_multi;
+            size_t wk_end = (multi + 1) * work_per_multi;
+
+            assert(wk_end > start);
+
+            if (wk_start >= end) {
+                break;
+            }

-        for (unsigned int multi=0; multi<_args._nmulti; multi++) {
             for (unsigned int k0=0; k0<_Ktotal; k0+=_k_block) {
                 const unsigned int kmax=std::min(k0 + _k_block, _Ktotal);

                 /* Figure out the size of each block. */
                 unsigned int k_size = kmax - k0;

-                // We need to insert padding at the end of each K section.
-                // The computation needed is a little delicate - the coordinates from the block walker are expressed in
-                // terms of the full, padded, _Ktotal.
-                // But we need to transform each section with reference to the original, unpadded, input, letting the
-                // transform pad each section as needed.
-
-                // This is needed for computations below.
-                const unsigned int rounded_section_size = roundup(_args._Ksize, strategy::k_unroll());
-
-                // The expected output format is also an entire <out_width> columns interleaved, then the next set of
-                // columns, and so on. This means, as we are breaking it up vertically, we have to do it one column at
-                // a time.
-                for (unsigned int x0=0; x0 < _args._Nsize; x0 += strategy::out_width() ){
-                    unsigned int xmax = std::min(x0 + strategy::out_width(), _args._Nsize);
-
-                    // Track where we are and how much work is left.
-                    unsigned int kpos = k0;
-                    unsigned int kleft = k_size;
-
-                    while (kleft) {
-                        // Which section are we in? Based on the rounded-up section size.
-                        unsigned int k_section_base = kpos / rounded_section_size;
-                        // How far into the section are we?
-                        unsigned int k_offset = kpos - (k_section_base * rounded_section_size);
-
-                        // We will either copy the rest of this section, or to the end of the requested length.
-                        unsigned int k_length = std::min(_args._Ksize - k_offset, kleft);
-
-                        strat.transforms.PrepareB(buffer, B + (multi * B_multi_stride), ldb,
-                                                  x0, xmax,
-                                                  (k_section_base * _args._Ksize) + k_offset,              // K starting point - compute row to read based on our section and the true section length.
-                                                  (k_section_base * _args._Ksize) + k_offset + k_length);  // K end point - starting point plus length computed above.
+                // Correct the N range and buffer base if we are not processing the whole block.
+                size_t n_start = 0;
+                size_t n_end = _args._Nsize;

-                        // We need to modify our position based on the ROUNDED version of what we just did.
-                        unsigned int padded_length = roundup(k_length, strategy::k_unroll());
+                // If we are not doing the first columns, update the buffer write position and starting N value.
+                if (start > wk_start) {
+                    n_start = (start - wk_start) * strategy::out_width();
+                }

-                        buffer += strategy::out_width() * padded_length;
+                // If we are not doing the last items, update the final N value.
+                if (end < wk_end) {
+                    n_end = (end - wk_start) * strategy::out_width();
+                }

-                        kpos += padded_length;
-                        kleft -= padded_length;
+                // Set the buffer pointer
+                Troi *buffer = buffer_base +
+                    (roundup(_args._Nsize, strategy::out_width()) * (multi * _Ktotal + k0)) +
+                    (n_start * roundup(k_size, strategy::k_unroll()));
+
+                if (_args._Ksections > 1) {
+                    // We need to insert padding at the end of each K section.
+                    // The computation needed is a little delicate - the k0/kmax coordinates are expressed in
+                    // terms of the full, padded, _Ktotal.
+                    // But we need to transform each section with reference to the original, unpadded, input, letting the
+                    // transform pad each section as needed.
+
+                    // This is needed for computations below.
+                    const unsigned int rounded_section_size = roundup(_args._Ksize, strategy::k_unroll());
+
+                    // The expected output format is also an entire <out_width> columns interleaved, then the next set of
+                    // columns, and so on. This means, as we are breaking it up vertically, we have to do it one column at
+                    // a time.
+                    for (unsigned int x0 = n_start; x0 < n_end; x0 += strategy::out_width()) {
+                        unsigned int xmax = std::min(x0 + strategy::out_width(), _args._Nsize);
+
+                        // Track where we are and how much work is left.
+                        unsigned int kpos = k0;
+                        unsigned int kleft = k_size;
+
+                        while (kleft) {
+                            // Which section are we in? Based on the rounded-up section size.
+                            unsigned int k_section_base = kpos / rounded_section_size;
+                            // How far into the section are we?
+                            unsigned int k_offset = kpos - (k_section_base * rounded_section_size);
+
+                            // We will either copy the rest of this section, or to the end of the requested length.
+                            unsigned int k_length = std::min(_args._Ksize - k_offset, kleft);
+
+                            strat.transforms.PrepareB(buffer, B + (multi * B_multi_stride), ldb,
+                                                      x0, xmax,
+                                                      (k_section_base * _args._Ksize) + k_offset,              // K starting point - compute row to read based on our section and the true section length.
+                                                      (k_section_base * _args._Ksize) + k_offset + k_length,   // K end point - starting point plus length computed above.
+                                                      transposed);
+
+                            // We need to modify our position based on the ROUNDED version of what we just did.
+                            unsigned int padded_length = roundup(k_length, strategy::k_unroll());
+
+                            buffer += strategy::out_width() * padded_length;
+
+                            kpos += padded_length;
+                            kleft -= padded_length;
+                        }
                     }
+                } else {
+                    // In the single K section case, can process the whole lot in one go.
+                    strat.transforms.PrepareB(buffer, B + (multi * B_multi_stride), ldb,
+                                              n_start, n_end, k0, std::min(kmax, _args._Ksize), transposed);
                 }
             }
         }
@@ -597,12 +746,17 @@ public:
     void set_pretransposed_B_data(void *in_buffer) override {
         // Put the transposed data after the column sums - in non-transposing cases get_col_sum_size() == 0
         uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
-        _B_transposed = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size());
+        _B_transposed = reinterpret_cast<Troi *>(buffer_int + get_col_sum_size());
         _col_bias = reinterpret_cast<int32_t *>(in_buffer);
     }

-    // Estimate cycles for given problem given provided parameters
-    static uint64_t estimate_cycles(const GemmArgs &args, const PerformanceParameters &params, const OutputStage &os = {} ) {
+    // Estimate cycles for given problem given provided parameters.
+    // "perf_type" is a type to pass along to get_performance_parameters to get the right set of performance
+    // parameters - it's arbitrary but usually either the input or output type.
+    template <typename perf_type>
+    static uint64_t estimate_cycles(const GemmArgs &args, const OutputStage &os = {}) {
+        const PerformanceParameters params = strategy::template get_performance_parameters<perf_type>(args._ci);
+
         // Note: Current hybrid kernels don't actually round up height (they
         // have paths for each possible height). Might need to make this
         // configurable in future.
@@ -666,8 +820,23 @@ public:
         assert(parms.input_channels == _args._Ksize);
         _convolver = std::unique_ptr<convolver<To>>(new convolver<To>(parms));
     }
+
+    GemmConfig get_config() override {
+        GemmConfig c;
+
+        c.method = GemmMethod::GEMM_HYBRID;
+        c.inner_block_size = _k_block;
+        c.outer_block_size = _n_block;
+        c.filter = get_type_name<strategy>();
+        c.weight_format = get_weight_format(kernel_weight_format<strategy, FixedFormat>::get(), sizeof(To));
+
+        return c;
+    }
 };

+template<typename strategy, typename To, typename Tr, typename OutputStage=Nothing>
+using GemmHybridIndirectFixedFormat = GemmHybridIndirect<strategy, To, Tr, OutputStage, false, true>;
+
 } // namespace arm_gemm

 #ifdef __I_DEFINED_UNUSED
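
Note on the stripe_width / kernel_weight_format helpers added in the anonymous namespace above: they use partial specialisation on a bool template parameter so that the fixed-format query on the strategy is only instantiated when FixedFormat is true, and a safe default is returned otherwise. Below is a minimal standalone sketch of that dispatch pattern; the MockStrategy type and the printed values are illustrative assumptions, not part of the patch.

#include <cstdio>

// Stand-in for an arm_gemm strategy that advertises a fixed-format stripe width.
struct MockStrategy {
    static unsigned int stripe_width() { return 4; }   // hypothetical value
};

// Primary template: used when FixedFormat is true, forwards to the strategy.
template<typename strategy, bool FixedFormat>
struct stripe_width {
    static unsigned int get() { return strategy::stripe_width(); }
};

// Partial specialisation for FixedFormat == false: never instantiates
// strategy::stripe_width(), so strategies without that member still compile.
template<typename strategy>
struct stripe_width<strategy, false> {
    static unsigned int get() { return 0; }
};

int main() {
    std::printf("fixed format stripe width: %u\n", stripe_width<MockStrategy, true>::get());   // prints 4
    std::printf("non-fixed-format default:  %u\n", stripe_width<MockStrategy, false>::get());  // prints 0
    return 0;
}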
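The run_hybrid_kernel shims above pad the bias whenever N is not a multiple of the kernel's output width: N is split into a bulk part that is a multiple of out_width (which can read the caller's bias directly) and a remainder whose bias is copied into a padded scratch buffer. A small sketch of just that split follows; the out_width and N values are assumed, and a std::vector stands in for the alloca scratch buffer used in the header.

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
    const unsigned int out_width = 8;   // assumed strategy::out_width()
    const unsigned int N = 21;          // assumed problem width

    std::vector<float> bias(N, 1.5f);   // caller-supplied bias, exactly N entries

    unsigned int N_remainder = N % out_width;    // 5 columns left over
    unsigned int N_bulk      = N - N_remainder;  // 16: largest multiple of out_width

    // Bulk columns read bias[0..N_bulk) directly (kernel call elided here).

    // The tail reads a full out_width block of bias, so copy the remainder
    // into a zero-padded buffer before handing it to the kernel.
    std::vector<float> bias_pad(out_width, 0.0f);
    std::copy(bias.begin() + N_bulk, bias.end(), bias_pad.begin());

    std::printf("N_bulk=%u N_remainder=%u padded bias entries=%zu\n",
                N_bulk, N_remainder, bias_pad.size());
    return 0;
}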