Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp')
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp  347
1 file changed, 258 insertions(+), 89 deletions(-)
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp
index 41fecc6bec..0cc4d4f3d9 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,7 +23,9 @@
*/
#pragma once
+#if !defined(_WIN64) && !defined(__OpenBSD__)
#include <alloca.h>
+#endif /* !defined(_WIN64) && !defined(__OpenBSD__) */
#include <algorithm>
#include <cassert>
@@ -31,6 +33,7 @@
#include "arm_gemm.hpp"
#include "bias_adder.hpp"
#include "convolver.hpp"
+#include "kernel_weight_format.hpp"
#include "ndrange.hpp"
#include "performance_parameters.hpp"
#include "transform.hpp"
@@ -52,34 +55,34 @@ namespace {
// We need to invoke the kernel differently for quantizing and non-quantizing cases, so here is a shim class to do
// that.
-template<typename OutputStage, bool SeparateQuantize = false>
+template<typename OutputStage, bool SeparateQuantize, bool FixedFormat>
class run_hybrid_kernel {
public:
- template<typename strategy, typename To, typename Tr>
- static void run (
+ template<typename strategy, typename Tlo, typename Tro, typename Tr>
+ static inline void run (
#ifdef CYCLE_PROFILING
profiler &prof,
#endif
- const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<To> A_arg, unsigned int M, unsigned int N,
- unsigned int kern_k, const To *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *bias_ptr, Activation act, bool accumulate,
+ const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<Tlo> A_arg, unsigned int M, unsigned int N,
+ unsigned int kern_k, const Tro *b_ptr, size_t b_stride, IndirectOutputArg<Tr> output_arg, const Tr *bias_ptr, Activation act, bool accumulate,
const OutputStage &os, const int32_t *col_bias, unsigned int n_0 );
};
template<>
-template<typename strategy, typename To, typename Tr>
-void run_hybrid_kernel<Nothing, false>::run(
+template<typename strategy, typename Tlo, typename Tro, typename Tr>
+inline void run_hybrid_kernel<Nothing, false, false>::run(
#ifdef CYCLE_PROFILING
profiler &prof,
#endif
- const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<To> A_arg, unsigned int M, unsigned int N,
- unsigned int kern_k, const To *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *bias_ptr, Activation act, bool accumulate,
+ const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<Tlo> A_arg, unsigned int M, unsigned int N,
+ unsigned int kern_k, const Tro *b_ptr, size_t, IndirectOutputArg<Tr> output_arg, const Tr *bias_ptr, Activation act, bool accumulate,
const Nothing &, const int32_t *, unsigned int) {
#ifdef CYCLE_PROFILING
auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)M * kern_k * roundup(N, strategy::out_width()));
#endif
UNUSED(kern_k);
- /* Indirect hybrid kernels read the full width of the bias. So we need to detect the case where we are writing
+ /* Indirect hybrid kernels read the full width of the bias. So we need to detect the case where we are writing
* a partial block and pad the bias for that block. */
if (bias_ptr && !accumulate && (N % strategy::out_width() != 0)) {
/* Break N into "N_bulk" (a multiple of output width) and "N_remainder" */
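The comment above describes why padding is needed: the hybrid kernels always read a full out_width() of bias, so a partial final column block would read past the end of the caller's bias buffer. A minimal, self-contained sketch of the bulk/remainder split and the zero-padded copy, using std::vector rather than the alloca the driver uses, with out_width fixed at 16 purely for illustration:

    #include <cstring>
    #include <vector>

    // Hypothetical stand-in for strategy::out_width().
    constexpr unsigned int out_width = 16;

    // Split N into a bulk part (multiple of out_width) and a remainder, and build a
    // zero-padded bias block for the remainder so a full-width kernel read is safe.
    std::vector<float> pad_bias_tail(const float *bias, unsigned int N) {
        unsigned int N_remainder = N % out_width;
        unsigned int N_bulk      = N - N_remainder;

        std::vector<float> padded(out_width, 0.0f);                        // full width, zero padded
        std::memcpy(padded.data(), bias + N_bulk, N_remainder * sizeof(float));
        return padded;                                                     // safe for a full-width read
    }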
@@ -112,13 +115,61 @@ void run_hybrid_kernel<Nothing, false>::run(
}
template<>
-template<typename strategy, typename To, typename Tr>
-void run_hybrid_kernel<Requantize32, false>::run(
+template<typename strategy, typename Tlo, typename Tro, typename Tr>
+inline void run_hybrid_kernel<Nothing, false, true>::run(
#ifdef CYCLE_PROFILING
profiler &prof,
#endif
- const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<To> A_arg, unsigned int M, unsigned int N,
- unsigned int kern_k, const To *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *, Activation, bool,
+ const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<Tlo> A_arg, unsigned int M, unsigned int N,
+ unsigned int kern_k, const Tro *b_ptr, size_t b_stride, IndirectOutputArg<Tr> output_arg, const Tr *bias_ptr, Activation act, bool accumulate,
+ const Nothing &, const int32_t *, unsigned int) {
+#ifdef CYCLE_PROFILING
+ auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)M * kern_k * roundup(N, strategy::out_width()));
+#endif
+ UNUSED(kern_k);
+
+ /* Indirect hybrid kernels read the full width of the bias. So we need to detect the case where we are writing
+ * a partial block and pad the bias for that block. */
+ if (bias_ptr && !accumulate && (N % strategy::out_width() != 0)) {
+ /* Break N into "N_bulk" (a multiple of output width) and "N_remainder" */
+ unsigned int N_remainder = N % strategy::out_width();
+ unsigned int N_bulk = N - N_remainder;
+
+ /* Output argument to be used for the tail */
+ IndirectOutputArg<Tr> offset_output = output_arg;
+
+ /* If there is a "bulk" to be processed, handle that and update "offset_output" appropriately. */
+ if (N_bulk > 0) {
+ strat.kernel(num_strings, string_ptr, A_arg, M, N_bulk, b_ptr, b_stride, output_arg, bias_ptr, act, accumulate);
+
+ if (output_arg.is_indirect) {
+ offset_output = IndirectOutputArg<Tr>(output_arg.indirect.ptr, output_arg.indirect.offset + N_bulk);
+ } else {
+ offset_output = IndirectOutputArg<Tr>(output_arg.direct.base + N_bulk, output_arg.direct.stride);
+ }
+ }
+
+ /* Pad the bias buffer for the remainder */
+ Tr *bias_pad_buffer = reinterpret_cast<Tr *>(alloca(strategy::out_width() * sizeof(Tr)));
+ memcpy(bias_pad_buffer, bias_ptr + N_bulk, N_remainder * sizeof(Tr));
+
+ /* Process the remainder, offsetting the B pointer as needed. */
+ strat.kernel(num_strings, string_ptr, A_arg, M, N_remainder,
+ b_ptr + (N_bulk / strategy::stripe_width()) * b_stride, b_stride, offset_output,
+ bias_pad_buffer, act, accumulate);
+ } else {
+ strat.kernel(num_strings, string_ptr, A_arg, M, N, b_ptr, b_stride, output_arg, bias_ptr, act, accumulate);
+ }
+}
+
+template<>
+template<typename strategy, typename Tlo, typename Tro, typename Tr>
+inline void run_hybrid_kernel<Requantize32, false, false>::run(
+#ifdef CYCLE_PROFILING
+ profiler &prof,
+#endif
+ const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<Tlo> A_arg, unsigned int M, unsigned int N,
+ unsigned int kern_k, const Tro *b_ptr, size_t, IndirectOutputArg<Tr> output_arg, const Tr *, Activation, bool,
const Requantize32 &os, const int32_t *col_bias, unsigned int n_0 ) {
#ifdef CYCLE_PROFILING
auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)M * kern_k * roundup(N, strategy::out_width()));
@@ -129,13 +180,13 @@ void run_hybrid_kernel<Requantize32, false>::run(
}
template<>
-template<typename strategy, typename To, typename Tr>
-void run_hybrid_kernel<Requantize32, true>::run(
+template<typename strategy, typename Tlo, typename Tro, typename Tr>
+inline void run_hybrid_kernel<Requantize32, true, false>::run(
#ifdef CYCLE_PROFILING
profiler &prof,
#endif
- const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<To> A_arg, unsigned int M, unsigned int N,
- unsigned int kern_k, const To *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *, Activation, bool,
+ const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<Tlo> A_arg, unsigned int M, unsigned int N,
+ unsigned int kern_k, const Tro *b_ptr, size_t, IndirectOutputArg<Tr> output_arg, const Tr *, Activation, bool,
const Requantize32 &os, const int32_t *col_bias, unsigned int n_0 ) {
UNUSED(kern_k);
// On this route we will only process one kernel height at a time and will make sure this happens in the driver loop.
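All of the specializations above follow the same shim pattern: run_hybrid_kernel is a class template parameterised on (OutputStage, SeparateQuantize, FixedFormat), and each supported combination explicitly specializes the static run(), so the driver loop selects the right call sequence at compile time with no runtime branch. A much-reduced sketch of that dispatch pattern, with placeholder tag types and empty bodies rather than the real kernel signatures:

    #include <cstdio>

    struct Nothing {};        // placeholder: no output stage
    struct Requantize32 {};   // placeholder: quantizing output stage

    // Primary template: one explicit specialization of run() per (OutputStage, FixedFormat) pair.
    template<typename OutputStage, bool FixedFormat>
    struct run_kernel_shim {
        static void run();
    };

    template<> inline void run_kernel_shim<Nothing, false>::run()      { std::puts("plain, pretransposed B"); }
    template<> inline void run_kernel_shim<Nothing, true>::run()       { std::puts("plain, fixed-format B"); }
    template<> inline void run_kernel_shim<Requantize32, false>::run() { std::puts("requantizing, pretransposed B"); }

    int main() {
        run_kernel_shim<Nothing, true>::run();   // selected entirely at compile time
    }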
@@ -178,12 +229,41 @@ void run_hybrid_kernel<Requantize32, true>::run(
}
}
+template<typename strategy, bool FixedFormat>
+struct stripe_width {
+ static unsigned int get() {
+ return strategy::stripe_width();
+ }
+};
+
+template<typename strategy>
+struct stripe_width<strategy, false> {
+ static unsigned int get() {
+ return 0;
+ }
+};
+
+template<typename strategy, bool FixedFormat>
+struct kernel_weight_format {
+ static KernelWeightFormat get() {
+ return strategy::kernel_weight_format();
+ }
+};
+
+template<typename strategy>
+struct kernel_weight_format<strategy, false> {
+ static KernelWeightFormat get() {
+ return KernelWeightFormat::NON_FIXED;
+ }
+};
+
} // anonymous namespace
// Implementation of the GemmCommon abstract class.
-template<typename strategy, typename To, typename Tr, typename OutputStage = Nothing, bool SeparateQuantize = false>
+template<typename strategy, typename To, typename Tr, typename OutputStage=Nothing, bool SeparateQuantize=false, bool FixedFormat=false>
class GemmHybridIndirect : public GemmCommon<To, Tr> {
- typedef typename strategy::operand_type Toi;
+ typedef typename strategy::lhs_operand_type Tloi;
+ typedef typename strategy::rhs_operand_type Troi;
typedef typename strategy::result_type Tri;
GemmArgs _args;
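The stripe_width and kernel_weight_format helpers above exist because strategy::stripe_width() and strategy::kernel_weight_format() are only provided by fixed-format strategies; even code guarded by a runtime `if (FixedFormat)` must still compile for the other strategies, so the FixedFormat==false partial specialization avoids ever naming those members. A minimal sketch of the same idea with hypothetical strategy types:

    #include <cstdio>

    // A "strategy" without stripe_width() (hypothetical).
    struct plain_strategy {
        static unsigned int out_width() { return 16; }
    };

    // A fixed-format "strategy" that provides it (hypothetical).
    struct fixed_strategy {
        static unsigned int out_width()    { return 16; }
        static unsigned int stripe_width() { return 4; }
    };

    // Primary template: only instantiated when FixedFormat is true.
    template<typename strategy, bool FixedFormat>
    struct stripe_width_helper {
        static unsigned int get() { return strategy::stripe_width(); }
    };

    // FixedFormat==false: never names strategy::stripe_width(), so plain
    // strategies compile even though they lack that member.
    template<typename strategy>
    struct stripe_width_helper<strategy, false> {
        static unsigned int get() { return 0; }
    };

    int main() {
        std::printf("%u %u\n",
                    stripe_width_helper<plain_strategy, false>::get(),   // 0
                    stripe_width_helper<fixed_strategy, true>::get());   // 4
    }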
@@ -201,7 +281,7 @@ class GemmHybridIndirect : public GemmCommon<To, Tr> {
const unsigned int _Mround;
/* Pretransposed buffer. */
- const Toi *_B_transposed=nullptr;
+ const Troi *_B_transposed=nullptr;
/* Indirect parameters. _indirect_buf doubles as a flag to indicate that "indirect" transform should be used. */
const To * const * const * _indirect_buf = nullptr;
@@ -233,7 +313,7 @@ class GemmHybridIndirect : public GemmCommon<To, Tr> {
}
if (args._cfg && args._cfg->inner_block_size) {
- return args._cfg->inner_block_size;
+ return roundup(args._cfg->inner_block_size, strategy::k_unroll());
}
// Experimental data suggests an optimal block size of 512 for FP32 (scaling accordingly for other
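The change above rounds a caller-supplied inner_block_size up to a multiple of strategy::k_unroll(), since the kernels consume K in k_unroll()-sized steps. A tiny worked example of that rounding; the helper below mirrors roundup() only for illustration, and a k_unroll of 8 is an assumption:

    #include <cassert>

    // roundup(x, m): smallest multiple of m that is >= x (m > 0).
    static unsigned int roundup(unsigned int x, unsigned int m) {
        return ((x + m - 1) / m) * m;
    }

    int main() {
        // e.g. a configured inner block of 500 with k_unroll() == 8 becomes 504.
        assert(roundup(500, 8) == 504);
        assert(roundup(512, 8) == 512);   // already a multiple: unchanged
    }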
@@ -356,11 +436,11 @@ public:
// In convolution mode, we need input pointers.
if (_convolver) {
- in_row_ptrs.resize(strategy::out_height() * _args._Ksections, nullptr);
- in_row_strings.resize(_args._Ksections, nullptr);
+ in_row_ptrs = std::vector<const To *>(strategy::out_height() * _args._Ksections, nullptr);
+ in_row_strings = std::vector<const To * const *>(_args._Ksections, nullptr);
for (unsigned int i=0; i<_args._Ksections; i++) {
- in_row_strings[i] = &(in_row_ptrs[i * strategy::out_height()]);
+ in_row_strings[i] = &(in_row_ptrs.data()[i * strategy::out_height()]);
}
}
@@ -370,8 +450,8 @@ public:
}
/* Make sure we've been set up correctly. */
- assert(_B_transposed);
- static_assert(std::is_same<To, Toi>::value, "gemm_native: Operand types must be the same.");
+ assert(FixedFormat || _B_transposed);
+ static_assert(std::is_same<To, Tloi>::value, "gemm_native: Operand types must be the same.");
// static_assert(std::is_same<Tr, Tri>::value, "gemm_native: Result types must be the same.");
/* For now, each work item implies all the K for a given output
@@ -422,27 +502,35 @@ public:
const unsigned int nmax = std::min(n0 + _n_block, _args._Nsize);
const unsigned int multi = p.dim(3);
- const Toi *b_panel = _B_transposed +
- (multi * roundup(_args._Nsize, strategy::out_width()) * _Ktotal) +
- (k0 * roundup(_args._Nsize, strategy::out_width())) +
- (n0 * kern_k);
+ const Troi *b_panel;
+ if (FixedFormat) {
+ b_panel = reinterpret_cast<const Troi *>(this->_Bptr) +
+ (multi * this->_B_multi_stride) +
+ ((n0 / stripe_width<strategy, FixedFormat>::get()) * this->_ldb) +
+ (k0 * stripe_width<strategy, FixedFormat>::get());
+ } else {
+ b_panel = _B_transposed +
+ (multi * roundup(_args._Nsize, strategy::out_width()) * _Ktotal) +
+ (k0 * roundup(_args._Nsize, strategy::out_width())) +
+ (n0 * kern_k);
+ }
- IndirectOutputArg<Tr> out_arg(this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + n0, this->_ldc);
+ IndirectOutputArg<Tr> out_arg(this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + n0, this->_ldc);
#ifdef CYCLE_PROFILING
auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(m_end - m_start) * kern_k * roundup(nmax-n0, strategy::out_width()));
#endif
if (_indirect_buf) {
- run_hybrid_kernel<OutputStage, SeparateQuantize>::run(
+ run_hybrid_kernel<OutputStage, SeparateQuantize, FixedFormat>::run(
#ifdef CYCLE_PROFILING
prof,
#endif
strat, sections, string_lengths.data(),
IndirectInputArg<To>(_indirect_buf + (multi * _args._nbatches * _args._Ksections) + (batch * _args._Ksections) + first_section, m_start, first_offset),
- (m_end - m_start), (nmax - n0), kern_k, b_panel, out_arg,
+ (m_end - m_start), (nmax - n0), kern_k, b_panel, this->_ldb, out_arg,
(this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr,
last_pass ? _args._act : Activation(),
- !first_pass,
+ !first_pass || _args._accumulate,
// Quantization parameters
_os, _col_bias+(multi * _args._Nsize), n0);
} else if (_convolver) {
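In the FixedFormat branch above, B is supplied by the caller already arranged as stripes of stripe_width() columns: consecutive K rows within a stripe are stripe_width() elements apart, and _ldb is the element stride from one stripe to the next, so the panel pointer selects the stripe holding column n0 and then steps k0 rows into it. A small worked sketch of that addressing with made-up strides (values are illustrative, not the library's):

    #include <cstddef>
    #include <cstdio>

    // Hypothetical layout parameters, for illustration only.
    constexpr unsigned int stripe_width   = 4;     // columns per stripe
    constexpr std::size_t  ldb            = 4096;  // elements from one stripe to the next
    constexpr std::size_t  B_multi_stride = 1u << 20;

    // Element offset of the panel starting at (multi, k0, n0) in a fixed-format B.
    std::size_t fixed_format_panel_offset(unsigned int multi, unsigned int k0, unsigned int n0) {
        return multi * B_multi_stride           // select the multi
             + (n0 / stripe_width) * ldb        // select the stripe containing column n0
             + k0 * stripe_width;               // advance k0 K-rows within the stripe
    }

    int main() {
        // multi 0, k0 = 32, n0 = 10 -> stripe 2, 32 rows in: 2*4096 + 32*4 = 8320.
        std::printf("%zu\n", fixed_format_panel_offset(0, 32, 10));
    }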
@@ -466,32 +554,32 @@ public:
}
assert(pos == sections);
- run_hybrid_kernel<OutputStage, SeparateQuantize>::run(
+ run_hybrid_kernel<OutputStage, SeparateQuantize, FixedFormat>::run(
#ifdef CYCLE_PROFILING
prof,
#endif
strat, sections, string_lengths.data(),
IndirectInputArg<To>(in_row_strings.data(), 0, first_offset),
- (m_end - m_start), (nmax - n0), kern_k, b_panel, out_arg,
+ (m_end - m_start), (nmax - n0), kern_k, b_panel, this->_ldb, out_arg,
(this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr,
last_pass ? _args._act : Activation(),
- !first_pass,
+ !first_pass || _args._accumulate,
// Quantization parameters
_os, _col_bias+(multi * _args._Nsize), n0);
} else {
// Length to process. This needs to exclude padding, but 'kmax' potentially includes it.
const unsigned int len = (std::min(_args._Ksize, kmax) - k0);
- run_hybrid_kernel<OutputStage, SeparateQuantize>::run(
+ run_hybrid_kernel<OutputStage, SeparateQuantize, FixedFormat>::run(
#ifdef CYCLE_PROFILING
prof,
#endif
strat, 1, &len,
IndirectInputArg<To>(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + m_start * this->_lda + k0, this->_lda),
- (m_end - m_start), (nmax - n0), kern_k, b_panel, out_arg,
+ (m_end - m_start), (nmax - n0), kern_k, b_panel, this->_ldb, out_arg,
(this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr,
last_pass ? _args._act : Activation(),
- !first_pass,
+ !first_pass || _args._accumulate,
// Quantization parameters
_os, _col_bias+(multi * _args._Nsize), n0);
}
@@ -501,16 +589,20 @@ public:
// Interface implementation - pretransposed
bool B_is_pretransposed() const override {
- return true;
+ return (FixedFormat == false);
}
bool B_pretranspose_required() const override {
- return (_B_transposed==nullptr);
+ return (FixedFormat == false) && (_B_transposed==nullptr);
}
size_t get_B_pretransposed_array_size() const override {
+ if (FixedFormat) {
+ return 0;
+ }
+
// Start with actual pretransposed buffer...
- size_t size = roundup(_args._Nsize, strategy::out_width()) * _Ktotal * _args._nmulti * sizeof(Toi);
+ size_t size = roundup(_args._Nsize, strategy::out_width()) * _Ktotal * _args._nmulti * sizeof(Troi);
// Space for result row pointers (not strictly needed any more but retained for indirect output testing)
size += _args._Msize * _args._nbatches * _args._nmulti * sizeof(const Tr *);
@@ -522,7 +614,11 @@ public:
return size;
}
- void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
+ size_t get_B_pretranspose_window_size() const override {
+ return _args._nmulti * iceildiv(_args._Nsize, strategy::out_width());
+ }
+
+ void requantize_bias(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
if (std::is_same<OutputStage, Requantize32>::value) {
_col_bias = reinterpret_cast<int32_t *>(in_buffer);
@@ -533,62 +629,115 @@ public:
compute_col_sums(*qp_ptr, _args._Nsize, _args._Ksize * _args._Ksections, B + (i * B_multi_stride), ldb, _col_bias + (i * _args._Nsize), _args._Ksize * _args._Ksections, i, 0);
}
}
+ }
+
+ bool B_pretranspose_supports_transpose() const override {
+ strategy strat(_args._ci);
+ return strat.transforms.PrepareB_supports_transpose();
+ }
+
+ void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride, bool transposed) override {
+ pretranspose_B_array_part(in_buffer, B, ldb, B_multi_stride, transposed, 0, get_B_pretranspose_window_size());
+ }
+
+ void pretranspose_B_array_part(void *in_buffer, const To *B, const int ldb, const int B_multi_stride, bool transposed, size_t start, size_t end) override {
+ if (end >= get_B_pretranspose_window_size()) {
+ requantize_bias(in_buffer, B, ldb, B_multi_stride);
+ }
// Put the transposed data after the column sums - in non-transposing cases get_col_sum_size() == 0
uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
- Toi *buffer = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size());
- _B_transposed = buffer;
+ Troi *buffer_base = reinterpret_cast<Troi *>(buffer_int + get_col_sum_size());
+ _B_transposed = buffer_base;
strategy strat(_args._ci);
+ size_t work_per_multi = iceildiv(_args._Nsize, strategy::out_width());
+
+ for (unsigned int multi=(start / work_per_multi); multi<_args._nmulti; multi++) {
+ // Work out which part of the window space this multi occupies,
+ // skip to the next multi or exit as needed.
+ size_t wk_start = multi * work_per_multi;
+ size_t wk_end = (multi + 1) * work_per_multi;
+
+ assert(wk_end > start);
+
+ if (wk_start >= end) {
+ break;
+ }
- for (unsigned int multi=0; multi<_args._nmulti; multi++) {
for (unsigned int k0=0; k0<_Ktotal; k0+=_k_block) {
const unsigned int kmax=std::min(k0 + _k_block, _Ktotal);
/* Figure out the size of each block. */
unsigned int k_size = kmax - k0;
- // We need to insert padding at the end of each K section.
- // The computation needed is a little delicate - the coordinates from the block walker are expressed in
- // terms of the full, padded, _Ktotal.
- // But we need to transform each section with reference to the original, unpadded, input, letting the
- // transform pad each section as needed.
-
- // This is needed for computations below.
- const unsigned int rounded_section_size = roundup(_args._Ksize, strategy::k_unroll());
-
- // The expected output format is also an entire <out_width> columns interleaved, then the next set of
- // columns, and so on. This means, as we are breaking it up vertically, we have to do it one column at
- // a time.
- for (unsigned int x0=0; x0 < _args._Nsize; x0 += strategy::out_width() ){
- unsigned int xmax = std::min(x0 + strategy::out_width(), _args._Nsize);
-
- // Track where we are and how much work is left.
- unsigned int kpos = k0;
- unsigned int kleft = k_size;
-
- while (kleft) {
- // Which section are we in? Based on the rounded-up section size.
- unsigned int k_section_base = kpos / rounded_section_size;
- // How far into the section are we?
- unsigned int k_offset = kpos - (k_section_base * rounded_section_size);
-
- // We will either copy the rest of this section, or to the end of the requested length.
- unsigned int k_length = std::min(_args._Ksize - k_offset, kleft);
-
- strat.transforms.PrepareB(buffer, B + (multi * B_multi_stride), ldb,
- x0, xmax,
- (k_section_base * _args._Ksize) + k_offset, // K starting point - compute row to read based on our section and the true section length.
- (k_section_base * _args._Ksize) + k_offset + k_length); // K end point - starting point plus length computed above.
+ // Correct the N range and buffer base if we are not processing the whole block.
+ size_t n_start = 0;
+ size_t n_end = _args._Nsize;
- // We need to modify our position based on the ROUNDED version of what we just did.
- unsigned int padded_length = roundup(k_length, strategy::k_unroll());
+ // If we are not doing the first columns, update the buffer write position and starting N value.
+ if (start > wk_start) {
+ n_start = (start - wk_start) * strategy::out_width();
+ }
- buffer += strategy::out_width() * padded_length;
+ // If we are not doing the last items, update the final N value.
+ if (end < wk_end) {
+ n_end = (end - wk_start) * strategy::out_width();
+ }
- kpos += padded_length;
- kleft -= padded_length;
+ // Set the buffer pointer
+ Troi *buffer = buffer_base +
+ (roundup(_args._Nsize, strategy::out_width()) * (multi * _Ktotal + k0)) +
+ (n_start * roundup(k_size, strategy::k_unroll()));
+
+ if (_args._Ksections > 1) {
+ // We need to insert padding at the end of each K section.
+ // The computation needed is a little delicate - the k0/kmax coordinates are expressed in
+ // terms of the full, padded, _Ktotal.
+ // But we need to transform each section with reference to the original, unpadded, input, letting the
+ // transform pad each section as needed.
+
+ // This is needed for computations below.
+ const unsigned int rounded_section_size = roundup(_args._Ksize, strategy::k_unroll());
+
+ // The expected output format is also an entire <out_width> columns interleaved, then the next set of
+ // columns, and so on. This means, as we are breaking it up vertically, we have to do it one column at
+ // a time.
+ for (unsigned int x0 = n_start; x0 < n_end; x0 += strategy::out_width()) {
+ unsigned int xmax = std::min(x0 + strategy::out_width(), _args._Nsize);
+
+ // Track where we are and how much work is left.
+ unsigned int kpos = k0;
+ unsigned int kleft = k_size;
+
+ while (kleft) {
+ // Which section are we in? Based on the rounded-up section size.
+ unsigned int k_section_base = kpos / rounded_section_size;
+ // How far into the section are we?
+ unsigned int k_offset = kpos - (k_section_base * rounded_section_size);
+
+ // We will either copy the rest of this section, or to the end of the requested length.
+ unsigned int k_length = std::min(_args._Ksize - k_offset, kleft);
+
+ strat.transforms.PrepareB(buffer, B + (multi * B_multi_stride), ldb,
+ x0, xmax,
+ (k_section_base * _args._Ksize) + k_offset, // K starting point - compute row to read based on our section and the true section length.
+ (k_section_base * _args._Ksize) + k_offset + k_length, // K end point - starting point plus length computed above.
+ transposed);
+
+ // We need to modify our position based on the ROUNDED version of what we just did.
+ unsigned int padded_length = roundup(k_length, strategy::k_unroll());
+
+ buffer += strategy::out_width() * padded_length;
+
+ kpos += padded_length;
+ kleft -= padded_length;
+ }
}
+ } else {
+ // In the single K section case, can process the whole lot in one go.
+ strat.transforms.PrepareB(buffer, B + (multi * B_multi_stride), ldb,
+ n_start, n_end, k0, std::min(kmax, _args._Ksize), transposed);
}
}
}
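The multi-section branch above has to translate K coordinates expressed against the padded _Ktotal (each section rounded up to k_unroll()) back into rows of the original unpadded input before calling PrepareB. A standalone sketch of just that coordinate walk, with hypothetical Ksize and k_unroll values:

    #include <algorithm>
    #include <cstdio>

    constexpr unsigned int Ksize    = 20;  // true (unpadded) rows per K section - hypothetical
    constexpr unsigned int k_unroll = 8;   // kernel K granularity - hypothetical

    constexpr unsigned int roundup(unsigned int x, unsigned int m) { return ((x + m - 1) / m) * m; }

    int main() {
        const unsigned int rounded_section_size = roundup(Ksize, k_unroll);  // 24 padded rows per section

        // Walk a padded K range and print the unpadded source rows for each piece.
        unsigned int kpos = 24, kleft = 40;   // e.g. the second K block of a larger problem

        while (kleft) {
            unsigned int k_section_base = kpos / rounded_section_size;                   // which section
            unsigned int k_offset       = kpos - k_section_base * rounded_section_size;  // depth into it
            unsigned int k_length       = std::min(Ksize - k_offset, kleft);             // rows to copy now

            std::printf("section %u: source rows [%u, %u)\n",
                        k_section_base,
                        k_section_base * Ksize + k_offset,
                        k_section_base * Ksize + k_offset + k_length);

            unsigned int padded_length = roundup(k_length, k_unroll);  // advance in padded coordinates
            kpos  += padded_length;
            kleft -= padded_length;
        }
    }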
@@ -597,12 +746,17 @@ public:
void set_pretransposed_B_data(void *in_buffer) override {
// Put the transposed data after the column sums - in non-transposing cases get_col_sum_size() == 0
uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
- _B_transposed = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size());
+ _B_transposed = reinterpret_cast<Troi *>(buffer_int + get_col_sum_size());
_col_bias = reinterpret_cast<int32_t *>(in_buffer);
}
- // Estimate cycles for given problem given provided parameters
- static uint64_t estimate_cycles(const GemmArgs &args, const PerformanceParameters &params, const OutputStage &os = {} ) {
+ // Estimate cycles for given problem given provided parameters.
+ // "perf_type" is a type to pass along to get_performance_parameters to get the right set of performance
+ // parameters - it's arbitrary but usually either the input or output type.
+ template <typename perf_type>
+ static uint64_t estimate_cycles(const GemmArgs &args, const OutputStage &os = {}) {
+ const PerformanceParameters params = strategy::template get_performance_parameters<perf_type>(args._ci);
+
// Note: Current hybrid kernels don't actually round up height (they
// have paths for each possible height). Might need to make this
// configurable in future.
@@ -666,8 +820,23 @@ public:
assert(parms.input_channels == _args._Ksize);
_convolver = std::unique_ptr<convolver<To>>(new convolver<To>(parms));
}
+
+ GemmConfig get_config() override {
+ GemmConfig c;
+
+ c.method = GemmMethod::GEMM_HYBRID;
+ c.inner_block_size = _k_block;
+ c.outer_block_size = _n_block;
+ c.filter = get_type_name<strategy>();
+ c.weight_format = get_weight_format(kernel_weight_format<strategy, FixedFormat>::get(), sizeof(To));
+
+ return c;
+ }
};
+template<typename strategy, typename To, typename Tr, typename OutputStage=Nothing>
+using GemmHybridIndirectFixedFormat = GemmHybridIndirect<strategy, To, Tr, OutputStage, false, true>;
+
} // namespace arm_gemm
#ifdef __I_DEFINED_UNUSED