1 files changed, 148 insertions, 14 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
index 4f732f7d94..897ec9d05f 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
@@ -29,7 +29,6 @@
 #include "arm_gemm.hpp"
 #include "bfloat.hpp"
 #include "convolver.hpp"
-#include "kernel_weight_format.hpp"
 #include "kernel_traits.hpp"
 #include "kernel_weight_format.hpp"
 #include "mergeresults.hpp"
@@ -191,10 +190,19 @@ void kernel_and_merge<false, false, Requantize32>::run(
     auto p=prof.ScopedProfiler(PROFILE_KERNEL, (m_max - m_0) * (n_max - n_0) * kern_k);
 #endif
 
+    // Offset C pointer in a similar way to non-quantized case above.
+    Tri *offset_c_ptr;
+
+    if (c_ptr == nullptr) {
+        offset_c_ptr = nullptr;
+    } else {
+        offset_c_ptr = c_ptr + m_0 * ldc + n_0;
+    }
+
     strat.kernel(// A and B pointers are just the packed panels.
                  a_ptr, b_panel,
                  // Provide relevant part of output array and row stride.
-                 c_ptr + m_0 * ldc + n_0, ldc,
+                 offset_c_ptr, ldc,
                  // M, N, K sizes
                  m_max-m_0, n_max - n_0, kern_k,
                  // Bias, activation, accumulation.  Need to offset the bias as needed.
@@ -247,6 +255,84 @@ void kernel_and_merge<true, false, Requantize32>::run(
     }
 }
 
+// Run a kernel with integrated merge, dequantizing to FP32: one kernel call receives the output, bias and dequantize parameters directly.
+template<>
+template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
+void kernel_and_merge<false, false, DequantizeFloat>::run(
+#ifdef CYCLE_PROFILING
+        profiler &prof,
+#endif
+        strategy &strat, const To *a_ptr, const To *b_panel, size_t, Tri *, // The unnamed size_t and Tri * parameters are ignored by this specialization.
+        Tr *c_ptr, int ldc, int kern_k, unsigned int m_0, unsigned int m_max,
+        unsigned int n_0, unsigned int n_max, const Tr *bias,
+        const Activation &act, bool accumulate, const DequantizeFloat &dq, const int32_t *col_bias,
+        Tab *acc_buff)
+{
+#ifdef CYCLE_PROFILING
+    auto p=prof.ScopedProfiler(PROFILE_KERNEL, (m_max - m_0) * (n_max - n_0) * kern_k);
+#endif
+
+    const int32_t *offset_col_bias = nullptr; // Stays null when col_bias is null; otherwise advanced to column n_0 below.
+    const Tr *offset_bias = nullptr; // Stays null when bias is null; otherwise advanced to column n_0 below.
+
+    if (col_bias) {
+        offset_col_bias = col_bias + n_0;
+    }
+
+    if (bias) {
+        offset_bias = bias + n_0;
+    }
+
+    strat.kernel(// A and B pointers are just the packed panels.
+                 a_ptr, b_panel,
+                 // Output array offset to row m_0 / column n_0 (a null c_ptr is passed through as null), and row stride.
+                 c_ptr ? (c_ptr + m_0 * ldc + n_0) : nullptr, ldc,
+                 // M, N, K sizes
+                 m_max-m_0, n_max - n_0, kern_k,
+                 // Column bias, dequantize parameters, bias, activation, accumulate flag and accumulation buffer.  Both bias pointers were offset to column n_0 above.
+                 offset_col_bias, dq, offset_bias, act, accumulate, acc_buff);
+}
+
+template<> // Two-step variant: run the kernel with c_panel as its buffer, then call dequantize_block_32 once per column block.
+template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
+void kernel_and_merge<true, false, DequantizeFloat>::run(
+#ifdef CYCLE_PROFILING
+        profiler &prof,
+#endif
+        strategy &strat, const To *a_ptr, const To *b_panel, size_t, Tri *c_panel,
+        Tr *c_ptr, int ldc, int kern_k, unsigned int m_0,
+        unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *bias,
+        const Activation &act, bool accumulate, const DequantizeFloat &qp, const int32_t *,
+        Tab *) // The unnamed size_t, const int32_t * and Tab * parameters are ignored by this specialization.
+{
+    const int bblocks = iceildiv(n_max - n_0, strategy::out_width()); // Number of out_width()-wide column blocks needed to cover [n_0, n_max).
+
+    { // Step 1: a single kernel call covering all bblocks column blocks, handed c_panel.
+#ifdef CYCLE_PROFILING
+        auto p=prof.ScopedProfiler(PROFILE_KERNEL, (strategy::out_height() * bblocks * strategy::out_width() * kern_k));
+#endif
+
+        strat.kernel(a_ptr, b_panel, c_panel, 1, bblocks, kern_k);
+    }
+
+    { // Step 2: dequantize_block_32 per column block, given c_panel and c_ptr (presumably panel in, output array out - confirm against dequantize_block_32).
+#ifdef CYCLE_PROFILING
+        auto p=prof.ScopedProfiler(PROFILE_QUANTIZE, ((m_max-m_0) * bblocks * strategy::out_width() * sizeof(Tr)));
+#endif
+        auto out_area = strategy::out_width() * strategy::out_height(); // Element distance between consecutive column blocks within c_panel.
+        for (int i=0; i<bblocks; i++) {
+            const unsigned int n_start = n_0 + (strategy::out_width() * i);
+            const unsigned int n_end = std::min(n_start + strategy::out_width(), n_max); // Clamp the final block to n_max.
+
+            dequantize_block_32(qp, (n_end - n_start), (m_max - m_0),
+                            c_panel + (i * out_area), strategy::out_width(),
+                            c_ptr + m_0 * ldc + n_start, ldc,
+                            bias != nullptr ? bias + n_start : nullptr, accumulate, act); // Bias is offset to this block's first column; a null bias stays null.
+
+        }
+    }
+}
+
 // Integer GEMMs can be used in two contexts - "normal" where the full 32-bit output is required, or in
 // "requantizing" context where the output will be requantized.
 //
@@ -280,6 +366,12 @@ public:
     typedef int32_t type;
 };
 
+template<typename strategy> // Partial specialization: DequantizeFloat output stage with the third template argument false.
+class accumulate_buffer_type<strategy, DequantizeFloat, false> {
+public:
+    typedef int32_t type; // This combination selects int32_t as `type`.
+};
+
 template<typename strategy, typename OutputStage>
 class accumulate_buffer_type<strategy, OutputStage, true> {
 public:
@@ -350,6 +442,7 @@ class GemmInterleaved : public GemmCommon<To, Tr> {
     const bool _thread_columns;
 
     const Activation _act;
+    const bool _accumulate;
 
     const int _maxthreads;
     int _nthreads;
@@ -579,15 +672,27 @@ class GemmInterleaved : public GemmCommon<To, Tr> {
             return roundup(args._cfg->inner_block_size, strategy::k_unroll());
         }
 
-        // K blocking not supported if we are requantizing.
-        if (std::is_same<OutputStage, Requantize32>::value) {
+        // K blocking not supported if we are requantizing with the merging
+        // kernels.
+        if (std::is_same<OutputStage, Requantize32>::value && MergeStep) {
             return get_ktotal(args);
         }
 
+        const unsigned int L1_size = args._ci->get_L1_cache_size();
+
         // Special blocking for SME
         if (is_sme<strategy>::value) {
-            // Don't bother to block below this size threshold, experimentally determined to be 320 for FP32
-            unsigned int scaling_threshold = 1280 / sizeof(Toi);
+            // Target 512 bytes for 64kB L1, or 1024 bytes for 128kB L1.
+            unsigned int target_bytes_per_block = L1_size / 128;
+
+            // The default cache size in gemm-linux is only 32kB, which would give
+            // 256 bytes here, so clamp the target to a minimum of 512 bytes.
+            if (target_bytes_per_block < 512) {
+                target_bytes_per_block = 512;
+            }
+
+            // Don't bother to block below this size threshold (1.25X target size)
+            unsigned int scaling_threshold = ((target_bytes_per_block * 5) / 4) / sizeof(Toi);
 
             if (get_ktotal(args) <= scaling_threshold) {
                 return get_ktotal(args);
@@ -595,7 +700,7 @@ class GemmInterleaved : public GemmCommon<To, Tr> {
 
             // Once we are blocking, this (lower) threshold determines when we should use more blocks
             // NOTE: Could be that some factor-based solution would work better here.
-            unsigned int max_block_size = 1024 / sizeof(Toi);
+            unsigned int max_block_size = target_bytes_per_block / sizeof(Toi);
 
             unsigned int num_k_blocks = iceildiv(get_ktotal(args), max_block_size);
 
@@ -604,7 +709,6 @@ class GemmInterleaved : public GemmCommon<To, Tr> {
             return k_block;
         }
 
-        const unsigned int L1_size = args._ci->get_L1_cache_size();
         unsigned int k_block;
 
         // k_block: Find out how much of the larger array can be loaded into half the cache.
@@ -639,6 +743,17 @@ class GemmInterleaved : public GemmCommon<To, Tr> {
             return roundup(args._cfg->outer_block_size, strategy::out_width());
         }
 
+        // Special blocking for SME
+        if (is_sme<strategy>::value) {
+            // If total width is less than 4x kernel width, return the entire width.
+            if (args._Nsize < strategy::out_width()*4) {
+                return roundup(args._Nsize, strategy::out_width());
+            }
+
+            // Otherwise block to single kernel width.
+            return strategy::out_width();
+        }
+
         unsigned int x_block;
         const unsigned int L2_size = args._ci->get_L2_cache_size();
         const unsigned int k_block = get_k_block_size(args);
@@ -680,7 +795,7 @@ public:
                       _Ksections(args._Ksections), _Ktotal(get_ktotal(args)),
                       _rounded_Ksize(roundup(_Ksize, strategy::k_unroll())),
                       _nbatches(args._nbatches), _nmulti(args._nmulti), _thread_columns(is_thread_columns(args)),
-                      _act(args._act), _maxthreads(args._maxthreads), _nthreads(args._maxthreads),
+                      _act(args._act), _accumulate(args._accumulate), _maxthreads(args._maxthreads), _nthreads(args._maxthreads),
                       _k_block(get_k_block_size(args)), _x_block(get_x_block_size(args)), _Mround(roundup(args._Msize, strategy::out_height())),
                       _os(os) { }
 
@@ -690,7 +805,7 @@ public:
                       _Ksections(args._Ksections), _Ktotal(get_ktotal(args)),
                       _rounded_Ksize(roundup(_Ksize, strategy::k_unroll())),
                       _nbatches(args._nbatches), _nmulti(args._nmulti), _thread_columns(is_thread_columns(args)),
-                      _act(args._act), _maxthreads(args._maxthreads), _nthreads(args._maxthreads),
+                      _act(args._act), _accumulate(args._accumulate),  _maxthreads(args._maxthreads), _nthreads(args._maxthreads),
                       _k_block(get_k_block_size(args)), _x_block(get_x_block_size(args)), _Mround(roundup(args._Msize, strategy::out_height())),
                       _os() { }
 
@@ -763,6 +878,9 @@ public:
                     const bool first_pass = (k0==0);
                     const bool last_pass  = (kmax==_Ktotal);
 
+                    // The bias pointer is passed on the first K pass only, except in the DequantizeFloat no-merge case (MergeStep false), where it is passed on the last pass.
+                    const bool bias_pass = (std::is_same<OutputStage, DequantizeFloat>::value && !MergeStep) ? last_pass : first_pass;
+
                     // Figure out how many "K" the kernel will actually process.
                     unsigned int kern_k = roundup(kmax - k0, strategy::k_unroll());
 
@@ -821,9 +939,9 @@ public:
                             // K size, and M/N ranges
                             kern_k, start_row, end_row, start_x, end_x,
                             // Only do bias on the first pass
-                            ((first_pass && this->_bias) ? this->_bias + (multi * this->_bias_multi_stride) : nullptr),
+                            ((bias_pass && this->_bias) ? this->_bias + (multi * this->_bias_multi_stride) : nullptr),
                             // Only do activation on the last pass, and accumulation on any non-first pass.
-                            (last_pass ? _act : Activation()), !first_pass,
+                            (last_pass ? _act : Activation()), (!first_pass || _accumulate),
                             // Pass in quantization parameters for requantizing kernels (others will ignore)
                             _os, col_bias + (multi * _Nsize),
                             // Accumulation buffer
@@ -948,6 +1066,9 @@ public:
                         const bool first_pass = (current.k0() == 0);
                         const bool last_pass  = (current.kmax() == _Ktotal);
 
+                        // The bias pointer is passed on the first K pass only, except in the DequantizeFloat no-merge case (MergeStep false), where it is passed on the last pass.
+                        const bool bias_pass = (std::is_same<OutputStage, DequantizeFloat>::value && !MergeStep) ? last_pass : first_pass;
+
                         // Pointer to appropriate part of result array.
                         Tr *result_ptr = this->_Cptr + (batch * this->_C_batch_stride) + (current.multi() * this->_C_multi_stride);
 
@@ -969,9 +1090,9 @@ public:
                             // K size, and M/N ranges
                             kern_k, y, ymax, current.x0(), current.xmax(),
                             // Only do bias on the first pass
-                            ((first_pass && this->_bias) ? this->_bias + (current.multi() * this->_bias_multi_stride) : nullptr),
+                            ((bias_pass && this->_bias) ? this->_bias + (current.multi() * this->_bias_multi_stride) : nullptr),
                             // Only do activation on the last pass, and accumulation on any non-first pass.
-                            (last_pass ? _act : Activation()), !first_pass,
+                            (last_pass ? _act : Activation()), (!first_pass || _accumulate),
                             // Pass in quantization parameters for requantizing kernels (others will ignore)
                             _os, col_bias + (current.multi() * _Nsize),
                             // Accumulation buffer
@@ -1184,6 +1305,13 @@ public:
         }
     }
 
+    void set_dequantize_scale(const float scale) override { // Stores `scale` into the output-stage parameters; does nothing unless OutputStage is DequantizeFloat.
+        if(std::is_same<OutputStage, DequantizeFloat>::value) { // Plain `if`, so the body below is still compiled for every OutputStage.
+            DequantizeFloat* df = reinterpret_cast<DequantizeFloat *>(&_os); // NOTE(review): the cast presumably exists so this compiles when _os is not a DequantizeFloat - confirm.
+            df->scale = scale;
+        }
+    }
+
     void set_indirect_parameters(size_t string_len, const To * const * const *ptr) override {
         assert(string_len == _Ksize);
         _indirect_buf = ptr;
@@ -1248,4 +1376,10 @@ using GemmInterleavedPretransposedNoMergeQuantizedInline = GemmInterleaved<strat
 template<typename strategy, typename To, typename Tr>
 using GemmInterleavedQuantized = GemmInterleaved<strategy, To, Tr, Requantize32>;
 
+template<typename strategy, typename To, typename Tr> // DequantizeFloat output stage with the fifth template argument false (presumably MergeStep - confirm against the GemmInterleaved declaration).
+using GemmInterleavedNoMergeDequantized = GemmInterleaved<strategy, To, Tr, DequantizeFloat, false>;
+
+template<typename strategy, typename To, typename Tr> // DequantizeFloat output stage with the remaining GemmInterleaved template arguments left at their defaults.
+using GemmInterleavedDequantized = GemmInterleaved<strategy, To, Tr, DequantizeFloat>;
+
 } // namespace arm_gemm