Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp')
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp  347
1 file changed, 258 insertions(+), 89 deletions(-)
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp
index 41fecc6bec..0cc4d4f3d9 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,7 +23,9 @@
*/
#pragma once
+#if !defined(_WIN64) && !defined(__OpenBSD__)
#include <alloca.h>
+#endif /* !defined(_WIN64) && !defined(__OpenBSD__) */
#include <algorithm>
#include <cassert>
@@ -31,6 +33,7 @@
#include "arm_gemm.hpp"
#include "bias_adder.hpp"
#include "convolver.hpp"
+#include "kernel_weight_format.hpp"
#include "ndrange.hpp"
#include "performance_parameters.hpp"
#include "transform.hpp"
@@ -52,34 +55,34 @@ namespace {
// We need to invoke the kernel differently for quantizing and non-quantizing cases, so here is a shim class to do
// that.
-template<typename OutputStage, bool SeparateQuantize = false>
+template<typename OutputStage, bool SeparateQuantize, bool FixedFormat>
class run_hybrid_kernel {
public:
- template<typename strategy, typename To, typename Tr>
- static void run (
+ template<typename strategy, typename Tlo, typename Tro, typename Tr>
+ static inline void run (
#ifdef CYCLE_PROFILING
profiler &prof,
#endif
- const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<To> A_arg, unsigned int M, unsigned int N,
- unsigned int kern_k, const To *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *bias_ptr, Activation act, bool accumulate,
+ const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<Tlo> A_arg, unsigned int M, unsigned int N,
+ unsigned int kern_k, const Tro *b_ptr, size_t b_stride, IndirectOutputArg<Tr> output_arg, const Tr *bias_ptr, Activation act, bool accumulate,
const OutputStage &os, const int32_t *col_bias, unsigned int n_0 );
};
template<>
-template<typename strategy, typename To, typename Tr>
-void run_hybrid_kernel<Nothing, false>::run(
+template<typename strategy, typename Tlo, typename Tro, typename Tr>
+inline void run_hybrid_kernel<Nothing, false, false>::run(
#ifdef CYCLE_PROFILING
profiler &prof,
#endif
- const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<To> A_arg, unsigned int M, unsigned int N,
- unsigned int kern_k, const To *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *bias_ptr, Activation act, bool accumulate,
+ const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<Tlo> A_arg, unsigned int M, unsigned int N,
+ unsigned int kern_k, const Tro *b_ptr, size_t, IndirectOutputArg<Tr> output_arg, const Tr *bias_ptr, Activation act, bool accumulate,
const Nothing &, const int32_t *, unsigned int) {
#ifdef CYCLE_PROFILING
auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)M * kern_k * roundup(N, strategy::out_width()));
#endif
UNUSED(kern_k);
- /* Indirect hybrid kernels read the full width of the bias. So we need to detect the case where we are writing
+ /* Indirect hybrid kernels read the full width of the bias. So we need to detect the case where we are writing
* a partial block and pad the bias for that block. */
if (bias_ptr && !accumulate && (N % strategy::out_width() != 0)) {
/* Break N into "N_bulk" (a multiple of output width) and "N_remainder" */
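The comment above describes why padding is needed: the hybrid kernels always read a full out_width() of bias, so a partial final column block would read past the end of the caller's bias buffer. A minimal, self-contained sketch of the bulk/remainder split and the zero-padded copy, using std::vector rather than the alloca the driver uses, with out_width fixed at 16 purely for illustration:

    #include <cstring>
    #include <vector>

    // Hypothetical stand-in for strategy::out_width().
    constexpr unsigned int out_width = 16;

    // Split N into a bulk part (multiple of out_width) and a remainder, and build a
    // zero-padded bias block for the remainder so a full-width kernel read is safe.
    std::vector<float> pad_bias_tail(const float *bias, unsigned int N) {
        unsigned int N_remainder = N % out_width;
        unsigned int N_bulk      = N - N_remainder;

        std::vector<float> padded(out_width, 0.0f);                        // full width, zero padded
        std::memcpy(padded.data(), bias + N_bulk, N_remainder * sizeof(float));
        return padded;                                                     // safe for a full-width read
    }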
@@ -112,13 +115,61 @@ void run_hybrid_kernel<Nothing, false>::run(
}
template<>
-template<typename strategy, typename To, typename Tr>
-void run_hybrid_kernel<Requantize32, false>::run(
+template<typename strategy, typename Tlo, typename Tro, typename Tr>
+inline void run_hybrid_kernel<Nothing, false, true>::run(
#ifdef CYCLE_PROFILING
profiler &prof,
#endif
- const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<To> A_arg, unsigned int M, unsigned int N,
- unsigned int kern_k, const To *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *, Activation, bool,
+ const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<Tlo> A_arg, unsigned int M, unsigned int N,
+ unsigned int kern_k, const Tro *b_ptr, size_t b_stride, IndirectOutputArg<Tr> output_arg, const Tr *bias_ptr, Activation act, bool accumulate,
+ const Nothing &, const int32_t *, unsigned int) {
+#ifdef CYCLE_PROFILING
+ auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)M * kern_k * roundup(N, strategy::out_width()));
+#endif
+ UNUSED(kern_k);
+
+ /* Indirect hybrid kernels read the full width of the bias. So we need to detect the case where we are writing
+ * a partial block and pad the bias for that block. */
+ if (bias_ptr && !accumulate && (N % strategy::out_width() != 0)) {
+ /* Break N into "N_bulk" (a multiple of output width) and "N_remainder" */
+ unsigned int N_remainder = N % strategy::out_width();
+ unsigned int N_bulk = N - N_remainder;
+
+ /* Output argument to be used for the tail */
+ IndirectOutputArg<Tr> offset_output = output_arg;
+
+ /* If there is a "bulk" to be processed, handle that and update "offset_output" appropriately. */
+ if (N_bulk > 0) {
+ strat.kernel(num_strings, string_ptr, A_arg, M, N_bulk, b_ptr, b_stride, output_arg, bias_ptr, act, accumulate);
+
+ if (output_arg.is_indirect) {
+ offset_output = IndirectOutputArg<Tr>(output_arg.indirect.ptr, output_arg.indirect.offset + N_bulk);
+ } else {
+ offset_output = IndirectOutputArg<Tr>(output_arg.direct.base + N_bulk, output_arg.direct.stride);
+ }
+ }
+
+ /* Pad the bias buffer for the remainder */
+ Tr *bias_pad_buffer = reinterpret_cast<Tr *>(alloca(strategy::out_width() * sizeof(Tr)));
+ memcpy(bias_pad_buffer, bias_ptr + N_bulk, N_remainder * sizeof(Tr));
+
+ /* Process the remainder, offsetting the B pointer as needed. */
+ strat.kernel(num_strings, string_ptr, A_arg, M, N_remainder,
+ b_ptr + (N_bulk / strategy::stripe_width()) * b_stride, b_stride, offset_output,
+ bias_pad_buffer, act, accumulate);
+ } else {
+ strat.kernel(num_strings, string_ptr, A_arg, M, N, b_ptr, b_stride, output_arg, bias_ptr, act, accumulate);
+ }
+}
+
+template<>
+template<typename strategy, typename Tlo, typename Tro, typename Tr>
+inline void run_hybrid_kernel<Requantize32, false, false>::run(
+#ifdef CYCLE_PROFILING
+ profiler &prof,
+#endif
+ const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<Tlo> A_arg, unsigned int M, unsigned int N,
+ unsigned int kern_k, const Tro *b_ptr, size_t, IndirectOutputArg<Tr> output_arg, const Tr *, Activation, bool,
const Requantize32 &os, const int32_t *col_bias, unsigned int n_0 ) {
#ifdef CYCLE_PROFILING
auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)M * kern_k * roundup(N, strategy::out_width()));
@@ -129,13 +180,13 @@ void run_hybrid_kernel<Requantize32, false>::run(
}
template<>
-template<typename strategy, typename To, typename Tr>
-void run_hybrid_kernel<Requantize32, true>::run(
+template<typename strategy, typename Tlo, typename Tro, typename Tr>
+inline void run_hybrid_kernel<Requantize32, true, false>::run(
#ifdef CYCLE_PROFILING
profiler &prof,
#endif
- const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<To> A_arg, unsigned int M, unsigned int N,
- unsigned int kern_k, const To *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *, Activation, bool,
+ const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<Tlo> A_arg, unsigned int M, unsigned int N,
+ unsigned int kern_k, const Tro *b_ptr, size_t, IndirectOutputArg<Tr> output_arg, const Tr *, Activation, bool,
const Requantize32 &os, const int32_t *col_bias, unsigned int n_0 ) {
UNUSED(kern_k);
// On this route we will only process one kernel height at a time and will make sure this happens in the driver loop.
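All of the specializations above follow the same shim pattern: run_hybrid_kernel is a class template parameterised on (OutputStage, SeparateQuantize, FixedFormat), and each supported combination explicitly specializes the static run(), so the driver loop selects the right call sequence at compile time with no runtime branch. A much-reduced sketch of that dispatch pattern, with placeholder tag types and empty bodies rather than the real kernel signatures:

    #include <cstdio>

    struct Nothing {};        // placeholder: no output stage
    struct Requantize32 {};   // placeholder: quantizing output stage

    // Primary template: one explicit specialization of run() per (OutputStage, FixedFormat) pair.
    template<typename OutputStage, bool FixedFormat>
    struct run_kernel_shim {
        static void run();
    };

    template<> inline void run_kernel_shim<Nothing, false>::run()      { std::puts("plain, pretransposed B"); }
    template<> inline void run_kernel_shim<Nothing, true>::run()       { std::puts("plain, fixed-format B"); }
    template<> inline void run_kernel_shim<Requantize32, false>::run() { std::puts("requantizing, pretransposed B"); }

    int main() {
        run_kernel_shim<Nothing, true>::run();   // selected entirely at compile time
    }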
@@ -178,12 +229,41 @@ void run_hybrid_kernel<Requantize32, true>::run(
}
}
+template<typename strategy, bool FixedFormat>
+struct stripe_width {
+ static unsigned int get() {
+ return strategy::stripe_width();
+ }
+};
+
+template<typename strategy>
+struct stripe_width<strategy, false> {
+ static unsigned int get() {
+ return 0;
+ }
+};
+
+template<typename strategy, bool FixedFormat>
+struct kernel_weight_format {
+ static KernelWeightFormat get() {
+ return strategy::kernel_weight_format();
+ }
+};
+
+template<typename strategy>
+struct kernel_weight_format<strategy, false> {
+ static KernelWeightFormat get() {
+ return KernelWeightFormat::NON_FIXED;
+ }
+};
+
} // anonymous namespace
// Implementation of the GemmCommon abstract class.
-template<typename strategy, typename To, typename Tr, typename OutputStage = Nothing, bool SeparateQuantize = false>
+template<typename strategy, typename To, typename Tr, typename OutputStage=Nothing, bool SeparateQuantize=false, bool FixedFormat=false>
class GemmHybridIndirect : public GemmCommon<To, Tr> {
- typedef typename strategy::operand_type Toi;
+ typedef typename strategy::lhs_operand_type Tloi;
+ typedef typename strategy::rhs_operand_type Troi;
typedef typename strategy::result_type Tri;
GemmArgs _args;
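The stripe_width and kernel_weight_format helpers above exist because strategy::stripe_width() and strategy::kernel_weight_format() are only provided by fixed-format strategies; even code guarded by a runtime `if (FixedFormat)` must still compile for the other strategies, so the FixedFormat==false partial specialization avoids ever naming those members. A minimal sketch of the same idea with hypothetical strategy types:

    #include <cstdio>

    // A "strategy" without stripe_width() (hypothetical).
    struct plain_strategy {
        static unsigned int out_width() { return 16; }
    };

    // A fixed-format "strategy" that provides it (hypothetical).
    struct fixed_strategy {
        static unsigned int out_width()    { return 16; }
        static unsigned int stripe_width() { return 4; }
    };

    // Primary template: only instantiated when FixedFormat is true.
    template<typename strategy, bool FixedFormat>
    struct stripe_width_helper {
        static unsigned int get() { return strategy::stripe_width(); }
    };

    // FixedFormat==false: never names strategy::stripe_width(), so plain
    // strategies compile even though they lack that member.
    template<typename strategy>
    struct stripe_width_helper<strategy, false> {
        static unsigned int get() { return 0; }
    };

    int main() {
        std::printf("%u %u\n",
                    stripe_width_helper<plain_strategy, false>::get(),   // 0
                    stripe_width_helper<fixed_strategy, true>::get());   // 4
    }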
@@ -201,7 +281,7 @@ class GemmHybridIndirect : public GemmCommon<To, Tr> {
const unsigned int _Mround;
/* Pretransposed buffer. */
- const Toi *_B_transposed=nullptr;
+ const Troi *_B_transposed=nullptr;
/* Indirect parameters. _indirect_buf doubles as a flag to indicate that "indirect" transform should be used. */
const To * const * const * _indirect_buf = nullptr;
@@ -233,7 +313,7 @@ class GemmHybridIndirect : public GemmCommon<To, Tr> {
}
if (args._cfg && args._cfg->inner_block_size) {
- return args._cfg->inner_block_size;
+ return roundup(args._cfg->inner_block_size, strategy::k_unroll());
}
// Experimental data suggests an optimal block size of 512 for FP32 (scaling accordingly for other
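The change above rounds a caller-supplied inner_block_size up to a multiple of strategy::k_unroll(), since the kernels consume K in k_unroll()-sized steps. A tiny worked example of that rounding; the helper below mirrors roundup() only for illustration, and a k_unroll of 8 is an assumption:

    #include <cassert>

    // roundup(x, m): smallest multiple of m that is >= x (m > 0).
    static unsigned int roundup(unsigned int x, unsigned int m) {
        return ((x + m - 1) / m) * m;
    }

    int main() {
        // e.g. a configured inner block of 500 with k_unroll() == 8 becomes 504.
        assert(roundup(500, 8) == 504);
        assert(roundup(512, 8) == 512);   // already a multiple: unchanged
    }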
@@ -356,11 +436,11 @@ public:
// In convolution mode, we need input pointers.
if (_convolver) {
- in_row_ptrs.resize(strategy::out_height() * _args._Ksections, nullptr);
- in_row_strings.resize(_args._Ksections, nullptr);
+ in_row_ptrs = std::vector<const To *>(strategy::out_height() * _args._Ksections, nullptr);
+ in_row_strings = std::vector<const To * const *>(_args._Ksections, nullptr);
for (unsigned int i=0; i<_args._Ksections; i++) {
- in_row_strings[i] = &(in_row_ptrs[i * strategy::out_height()]);
+ in_row_strings[i] = &(in_row_ptrs.data()[i * strategy::out_height()]);
}
}
@@ -370,8 +450,8 @@ public:
}
/* Make sure we've been set up correctly. */
- assert(_B_transposed);
- static_assert(std::is_same<To, Toi>::value, "gemm_native: Operand types must be the same.");
+ assert(FixedFormat || _B_transposed);
+ static_assert(std::is_same<To, Tloi>::value, "gemm_native: Operand types must be the same.");
// static_assert(std::is_same<Tr, Tri>::value, "gemm_native: Result types must be the same.");
/* For now, each work item implies all the K for a given output
@@ -422,27 +502,35 @@ public:
const unsigned int nmax = std::min(n0 + _n_block, _args._Nsize);
const unsigned int multi = p.dim(3);
- const Toi *b_panel = _B_transposed +
- (multi * roundup(_args._Nsize, strategy::out_width()) * _Ktotal) +
- (k0 * roundup(_args._Nsize, strategy::out_width())) +
- (n0 * kern_k);
+ const Troi *b_panel;
+ if (FixedFormat) {
+ b_panel = reinterpret_cast<const Troi *>(this->_Bptr) +
+ (multi * this->_B_multi_stride) +
+ ((n0 / stripe_width<strategy, FixedFormat>::get()) * this->_ldb) +
+ (k0 * stripe_width<strategy, FixedFormat>::get());
+ } else {
+ b_panel = _B_transposed +
+ (multi * roundup(_args._Nsize, strategy::out_width()) * _Ktotal) +
+ (k0 * roundup(_args._Nsize, strategy::out_width())) +
+ (n0 * kern_k);
+ }
- IndirectOutputArg<Tr> out_arg(this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + n0, this->_ldc);
+ IndirectOutputArg<Tr> out_arg(this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + n0, this->_ldc);
#ifdef CYCLE_PROFILING
auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(m_end - m_start) * kern_k * roundup(nmax-n0, strategy::out_width()));
#endif
if (_indirect_buf) {
- run_hybrid_kernel<OutputStage, SeparateQuantize>::run(
+ run_hybrid_kernel<OutputStage, SeparateQuantize, FixedFormat>::run(
#ifdef CYCLE_PROFILING
prof,
#endif
strat, sections, string_lengths.data(),
IndirectInputArg<To>(_indirect_buf + (multi * _args._nbatches * _args._Ksections) + (batch * _args._Ksections) + first_section, m_start, first_offset),
- (m_end - m_start), (nmax - n0), kern_k, b_panel, out_arg,
+ (m_end - m_start), (nmax - n0), kern_k, b_panel, this->_ldb, out_arg,
(this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr,
last_pass ? _args._act : Activation(),
- !first_pass,
+ !first_pass || _args._accumulate,
// Quantization parameters
_os, _col_bias+(multi * _args._Nsize), n0);
} else if (_convolver) {
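In the FixedFormat branch above, B is supplied by the caller already arranged as stripes of stripe_width() columns: consecutive K rows within a stripe are stripe_width() elements apart, and _ldb is the element stride from one stripe to the next, so the panel pointer selects the stripe holding column n0 and then steps k0 rows into it. A small worked sketch of that addressing with made-up strides (values are illustrative, not the library's):

    #include <cstddef>
    #include <cstdio>

    // Hypothetical layout parameters, for illustration only.
    constexpr unsigned int stripe_width   = 4;     // columns per stripe
    constexpr std::size_t  ldb            = 4096;  // elements from one stripe to the next
    constexpr std::size_t  B_multi_stride = 1u << 20;

    // Element offset of the panel starting at (multi, k0, n0) in a fixed-format B.
    std::size_t fixed_format_panel_offset(unsigned int multi, unsigned int k0, unsigned int n0) {
        return multi * B_multi_stride           // select the multi
             + (n0 / stripe_width) * ldb        // select the stripe containing column n0
             + k0 * stripe_width;               // advance k0 K-rows within the stripe
    }

    int main() {
        // multi 0, k0 = 32, n0 = 10 -> stripe 2, 32 rows in: 2*4096 + 32*4 = 8320.
        std::printf("%zu\n", fixed_format_panel_offset(0, 32, 10));
    }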
@@ -466,32 +554,32 @@ public:
}
assert(pos == sections);
- run_hybrid_kernel<OutputStage, SeparateQuantize>::run(
+ run_hybrid_kernel<OutputStage, SeparateQuantize, FixedFormat>::run(
#ifdef CYCLE_PROFILING
prof,
#endif
strat, sections, string_lengths.data(),
IndirectInputArg<To>(in_row_strings.data(), 0, first_offset),
- (m_end - m_start), (nmax - n0), kern_k, b_panel, out_arg,
+ (m_end - m_start), (nmax - n0), kern_k, b_panel, this->_ldb, out_arg,
(this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr,
last_pass ? _args._act : Activation(),
- !first_pass,
+ !first_pass || _args._accumulate,
// Quantization parameters
_os, _col_bias+(multi * _args._Nsize), n0);
} else {
// Length to process. This needs to exclude padding, but 'kmax' potentially includes it.
const unsigned int len = (std::min(_args._Ksize, kmax) - k0);
- run_hybrid_kernel<OutputStage, SeparateQuantize>::run(
+ run_hybrid_kernel<OutputStage, SeparateQuantize, FixedFormat>::run(
#ifdef CYCLE_PROFILING
prof,
#endif
strat, 1, &len,
IndirectInputArg<To>(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + m_start * this->_lda + k0, this->_lda),
- (m_end - m_start), (nmax - n0), kern_k, b_panel, out_arg,
+ (m_end - m_start), (nmax - n0), kern_k, b_panel, this->_ldb, out_arg,
(this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr,
last_pass ? _args._act : Activation(),
- !first_pass,
+ !first_pass || _args._accumulate,
// Quantization parameters
_os, _col_bias+(multi * _args._Nsize), n0);
}
@@ -501,16 +589,20 @@ public:
// Interface implementation - pretransposed
bool B_is_pretransposed() const override {
- return true;
+ return (FixedFormat == false);
}
bool B_pretranspose_required() const override {
- return (_B_transposed==nullptr);
+ return (FixedFormat == false) && (_B_transposed==nullptr);
}
size_t get_B_pretransposed_array_size() const override {
+ if (FixedFormat) {
+ return 0;
+ }
+
// Start with actual pretransposed buffer...
- size_t size = roundup(_args._Nsize, strategy::out_width()) * _Ktotal * _args._nmulti * sizeof(Toi);
+ size_t size = roundup(_args._Nsize, strategy::out_width()) * _Ktotal * _args._nmulti * sizeof(Troi);
// Space for result row pointers (not strictly needed any more but retained for indirect output testing)
size += _args._Msize * _args._nbatches * _args._nmulti * sizeof(const Tr *);
@@ -522,7 +614,11 @@ public:
return size;
}
- void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
+ size_t get_B_pretranspose_window_size() const override {
+ return _args._nmulti * iceildiv(_args._Nsize, strategy::out_width());
+ }
+
+ void requantize_bias(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
if (std::is_same<OutputStage, Requantize32>::value) {
_col_bias = reinterpret_cast<int32_t *>(in_buffer);
@@ -533,62 +629,115 @@ public:
compute_col_sums(*qp_ptr, _args._Nsize, _args._Ksize * _args._Ksections, B + (i * B_multi_stride), ldb, _col_bias + (i * _args._Nsize), _args._Ksize * _args._Ksections, i, 0);
}
}
+ }
+
+ bool B_pretranspose_supports_transpose() const override {
+ strategy strat(_args._ci);
+ return strat.transforms.PrepareB_supports_transpose();
+ }
+
+ void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride, bool transposed) override {
+ pretranspose_B_array_part(in_buffer, B, ldb, B_multi_stride, transposed, 0, get_B_pretranspose_window_size());
+ }
+
+ void pretranspose_B_array_part(void *in_buffer, const To *B, const int ldb, const int B_multi_stride, bool transposed, size_t start, size_t end) override {
+ if (end >= get_B_pretranspose_window_size()) {
+ requantize_bias(in_buffer, B, ldb, B_multi_stride);
+ }
// Put the transposed data after the column sums - in non-transposing cases get_col_sum_size() == 0
uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
- Toi *buffer = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size());
- _B_transposed = buffer;
+ Troi *buffer_base = reinterpret_cast<Troi *>(buffer_int + get_col_sum_size());
+ _B_transposed = buffer_base;
strategy strat(_args._ci);
+ size_t work_per_multi = iceildiv(_args._Nsize, strategy::out_width());
+
+ for (unsigned int multi=(start / work_per_multi); multi<_args._nmulti; multi++) {
+ // Work out which part of the window space this multi occupies,
+ // skip to the next multi or exit as needed.
+ size_t wk_start = multi * work_per_multi;
+ size_t wk_end = (multi + 1) * work_per_multi;
+
+ assert(wk_end > start);
+
+ if (wk_start >= end) {
+ break;
+ }
- for (unsigned int multi=0; multi<_args._nmulti; multi++) {
for (unsigned int k0=0; k0<_Ktotal; k0+=_k_block) {
const unsigned int kmax=std::min(k0 + _k_block, _Ktotal);
/* Figure out the size of each block. */
unsigned int k_size = kmax - k0;
- // We need to insert padding at the end of each K section.
- // The computation needed is a little delicate - the coordinates from the block walker are expressed in
- // terms of the full, padded, _Ktotal.
- // But we need to transform each section with reference to the original, unpadded, input, letting the
- // transform pad each section as needed.
-
- // This is needed for computations below.
- const unsigned int rounded_section_size = roundup(_args._Ksize, strategy::k_unroll());
-
- // The expected output format is also an entire <out_width> columns interleaved, then the next set of
- // columns, and so on. This means, as we are breaking it up vertically, we have to do it one column at
- // a time.
- for (unsigned int x0=0; x0 < _args._Nsize; x0 += strategy::out_width() ){
- unsigned int xmax = std::min(x0 + strategy::out_width(), _args._Nsize);
-
- // Track where we are and how much work is left.
- unsigned int kpos = k0;
- unsigned int kleft = k_size;
-
- while (kleft) {
- // Which section are we in? Based on the rounded-up section size.
- unsigned int k_section_base = kpos / rounded_section_size;
- // How far into the section are we?
- unsigned int k_offset = kpos - (k_section_base * rounded_section_size);
-
- // We will either copy the rest of this section, or to the end of the requested length.
- unsigned int k_length = std::min(_args._Ksize - k_offset, kleft);
-
- strat.transforms.PrepareB(buffer, B + (multi * B_multi_stride), ldb,
- x0, xmax,
- (k_section_base * _args._Ksize) + k_offset, // K starting point - compute row to read based on our section and the true section length.
- (k_section_base * _args._Ksize) + k_offset + k_length); // K end point - starting point plus length computed above.
+ // Correct the N range and buffer base if we are not processing the whole block.
+ size_t n_start = 0;
+ size_t n_end = _args._Nsize;
- // We need to modify our position based on the ROUNDED version of what we just did.
- unsigned int padded_length = roundup(k_length, strategy::k_unroll());
+ // If we are not doing the first columns, update the buffer write position and starting N value.
+ if (start > wk_start) {
+ n_start = (start - wk_start) * strategy::out_width();
+ }
- buffer += strategy::out_width() * padded_length;
+ // If we are not doing the last items, update the final N value.
+ if (end < wk_end) {
+ n_end = (end - wk_start) * strategy::out_width();
+ }
- kpos += padded_length;
- kleft -= padded_length;
+ // Set the buffer pointer
+ Troi *buffer = buffer_base +
+ (roundup(_args._Nsize, strategy::out_width()) * (multi * _Ktotal + k0)) +
+ (n_start * roundup(k_size, strategy::k_unroll()));
+
+ if (_args._Ksections > 1) {
+ // We need to insert padding at the end of each K section.
+ // The computation needed is a little delicate - the k0/kmax coordinates are expressed in
+ // terms of the full, padded, _Ktotal.
+ // But we need to transform each section with reference to the original, unpadded, input, letting the
+ // transform pad each section as needed.
+
+ // This is needed for computations below.
+ const unsigned int rounded_section_size = roundup(_args._Ksize, strategy::k_unroll());
+
+ // The expected output format is also an entire <out_width> columns interleaved, then the next set of
+ // columns, and so on. This means, as we are breaking it up vertically, we have to do it one column at
+ // a time.
+ for (unsigned int x0 = n_start; x0 < n_end; x0 += strategy::out_width()) {
+ unsigned int xmax = std::min(x0 + strategy::out_width(), _args._Nsize);
+
+ // Track where we are and how much work is left.
+ unsigned int kpos = k0;
+ unsigned int kleft = k_size;
+
+ while (kleft) {
+ // Which section are we in? Based on the rounded-up section size.
+ unsigned int k_section_base = kpos / rounded_section_size;
+ // How far into the section are we?
+ unsigned int k_offset = kpos - (k_section_base * rounded_section_size);
+
+ // We will either copy the rest of this section, or to the end of the requested length.
+ unsigned int k_length = std::min(_args._Ksize - k_offset, kleft);
+
+ strat.transforms.PrepareB(buffer, B + (multi * B_multi_stride), ldb,
+ x0, xmax,
+ (k_section_base * _args._Ksize) + k_offset, // K starting point - compute row to read based on our section and the true section length.
+ (k_section_base * _args._Ksize) + k_offset + k_length, // K end point - starting point plus length computed above.
+ transposed);
+
+ // We need to modify our position based on the ROUNDED version of what we just did.
+ unsigned int padded_length = roundup(k_length, strategy::k_unroll());
+
+ buffer += strategy::out_width() * padded_length;
+
+ kpos += padded_length;
+ kleft -= padded_length;
+ }
}
+ } else {
+ // In the single K section case, can process the whole lot in one go.
+ strat.transforms.PrepareB(buffer, B + (multi * B_multi_stride), ldb,
+ n_start, n_end, k0, std::min(kmax, _args._Ksize), transposed);
}
}
}
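The multi-section branch above has to translate K coordinates expressed against the padded _Ktotal (each section rounded up to k_unroll()) back into rows of the original unpadded input before calling PrepareB. A standalone sketch of just that coordinate walk, with hypothetical Ksize and k_unroll values:

    #include <algorithm>
    #include <cstdio>

    constexpr unsigned int Ksize    = 20;  // true (unpadded) rows per K section - hypothetical
    constexpr unsigned int k_unroll = 8;   // kernel K granularity - hypothetical

    constexpr unsigned int roundup(unsigned int x, unsigned int m) { return ((x + m - 1) / m) * m; }

    int main() {
        const unsigned int rounded_section_size = roundup(Ksize, k_unroll);  // 24 padded rows per section

        // Walk a padded K range and print the unpadded source rows for each piece.
        unsigned int kpos = 24, kleft = 40;   // e.g. the second K block of a larger problem

        while (kleft) {
            unsigned int k_section_base = kpos / rounded_section_size;                   // which section
            unsigned int k_offset       = kpos - k_section_base * rounded_section_size;  // depth into it
            unsigned int k_length       = std::min(Ksize - k_offset, kleft);             // rows to copy now

            std::printf("section %u: source rows [%u, %u)\n",
                        k_section_base,
                        k_section_base * Ksize + k_offset,
                        k_section_base * Ksize + k_offset + k_length);

            unsigned int padded_length = roundup(k_length, k_unroll);  // advance in padded coordinates
            kpos  += padded_length;
            kleft -= padded_length;
        }
    }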
@@ -597,12 +746,17 @@ public:
void set_pretransposed_B_data(void *in_buffer) override {
// Put the transposed data after the column sums - in non-transposing cases get_col_sum_size() == 0
uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
- _B_transposed = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size());
+ _B_transposed = reinterpret_cast<Troi *>(buffer_int + get_col_sum_size());
_col_bias = reinterpret_cast<int32_t *>(in_buffer);
}
- // Estimate cycles for given problem given provided parameters
- static uint64_t estimate_cycles(const GemmArgs &args, const PerformanceParameters &params, const OutputStage &os = {} ) {
+ // Estimate cycles for given problem given provided parameters.
+ // "perf_type" is a type to pass along to get_performance_parameters to get the right set of performance
+ // parameters - it's arbitrary but usually either the input or output type.
+ template <typename perf_type>
+ static uint64_t estimate_cycles(const GemmArgs &args, const OutputStage &os = {}) {
+ const PerformanceParameters params = strategy::template get_performance_parameters<perf_type>(args._ci);
+
// Note: Current hybrid kernels don't actually round up height (they
// have paths for each possible height). Might need to make this
// configurable in future.
@@ -666,8 +820,23 @@ public:
assert(parms.input_channels == _args._Ksize);
_convolver = std::unique_ptr<convolver<To>>(new convolver<To>(parms));
}
+
+ GemmConfig get_config() override {
+ GemmConfig c;
+
+ c.method = GemmMethod::GEMM_HYBRID;
+ c.inner_block_size = _k_block;
+ c.outer_block_size = _n_block;
+ c.filter = get_type_name<strategy>();
+ c.weight_format = get_weight_format(kernel_weight_format<strategy, FixedFormat>::get(), sizeof(To));
+
+ return c;
+ }
};
+template<typename strategy, typename To, typename Tr, typename OutputStage=Nothing>
+using GemmHybridIndirectFixedFormat = GemmHybridIndirect<strategy, To, Tr, OutputStage, false, true>;
+
} // namespace arm_gemm
#ifdef __I_DEFINED_UNUSED