author     Francesco.Petrogalli@arm.com <francesco.petrogalli@arm.com>   2022-04-05 10:31:08 +0000
committer  Francesco Petrogalli <francesco.petrogalli@arm.com>           2022-05-24 14:28:27 +0000
commit     5fcf22dadf092efd7aafb359f9229aa270eb1129 (patch)
tree       f309426ed19bd6710329da3b530167db72d1c6b2 /src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
parent     a8caa023f0d7b71b3a250a14ceee935052fcc74a (diff)
[arm_gemm] Import fixed-format kernels from gemm_linux.
This is a No Functional Change Intended (NFCI) patch. It imports the kernels
into the code, but the interface to select them and to expose the format of
the weight tensors to the user will be provided in a subsequent patch.

Kernels and kernel selection code in arm_gemm have been provided by
David.Mansell <David.Mansell@arm.com>.

The kernels are not compiled into the library by default; they need to be
enabled via the `scons` option `experimental_fixed_format_kernels=1`.

Resolves: ONCPUML-829

Signed-off-by: Francesco.Petrogalli@arm.com <francesco.petrogalli@arm.com>
Change-Id: If00ccb2b9b7221e01b214cf9783111226ccc8bf4
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7380
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-by: SiCong Li <sicong.li@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
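As a concrete illustration of that build switch (a sketch only: apart from experimental_fixed_format_kernels=1, which comes from the commit message, the other options are ordinary ComputeLibrary scons settings chosen for the example and not prescribed by this patch), a build that compiles the fixed-format kernels in could look like:

    scons os=linux arch=armv8a neon=1 experimental_fixed_format_kernels=1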
Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp')
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp | 143
1 file changed, 118 insertions(+), 25 deletions(-)
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
index c75c320a6b..4ad54426e9 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,7 +27,9 @@
#include <cassert>
#include "arm_gemm.hpp"
+#include "bfloat.hpp"
#include "convolver.hpp"
+#include "kernel_weight_format.hpp"
#include "mergeresults.hpp"
#include "performance_parameters.hpp"
#include "quantized.hpp"
@@ -56,7 +58,7 @@ namespace {
// Others output directly to the matrix result. This helper class calls the
// appropriate functions, using templating to avoid calling non-existent
// functions.
-template<bool MergeStep, typename OutputStage>
+template<bool MergeStep, bool FixedFormat, typename OutputStage>
class kernel_and_merge {
public:
template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
@@ -64,7 +66,7 @@ public:
#ifdef CYCLE_PROFILING
profiler &prof,
#endif
- strategy &strat, const To *a_ptr, const To *b_panel, Tri *c_panel,
+ strategy &strat, const To *a_ptr, const To *b_panel, size_t b_stride, Tri *c_panel,
Tr *c_ptr, int ldc, int kern_k, unsigned int m_0,
unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *biasptr,
const Activation &act, bool accumulate, const OutputStage &os, const int32_t *col_bias,
@@ -74,11 +76,11 @@ public:
// Run a kernel and call the separate merge step
template<>
template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
-void kernel_and_merge<true, Nothing>::run(
+void kernel_and_merge<true, false, Nothing>::run(
#ifdef CYCLE_PROFILING
profiler &prof,
#endif
- strategy &strat, const To *a_ptr, const To *b_panel, Tri *c_panel,
+ strategy &strat, const To *a_ptr, const To *b_panel, size_t, Tri *c_panel,
Tr *c_ptr, int ldc, int kern_k, unsigned int m_0,
unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *biasptr,
const Activation &act, bool accumulate, const Nothing &, const int32_t *, Tab *)
@@ -101,14 +103,44 @@ void kernel_and_merge<true, Nothing>::run(
}
}
+// Run a fixed-format kernel and call the separate merge step
+template<>
+template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
+void kernel_and_merge<true, true, Nothing>::run(
+#ifdef CYCLE_PROFILING
+ profiler &prof,
+#endif
+ strategy &strat, const To *a_ptr, const To *b_panel, size_t b_stride, Tri *c_panel,
+ Tr *c_ptr, int ldc, int kern_k, unsigned int m_0,
+ unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *biasptr,
+ const Activation &act, bool accumulate, const Nothing &, const int32_t *, Tab *)
+{
+ {
+#ifdef CYCLE_PROFILING
+ const int bblocks = iceildiv(n_max - n_0, strategy::out_width());
+ auto p=prof.ScopedProfiler(PROFILE_KERNEL, (strategy::out_height() * bblocks * strategy::out_width() * kern_k));
+#endif
+
+ strat.kernel(a_ptr, b_panel, b_stride, c_panel, 1, (n_max - n_0), kern_k);
+ }
+
+ {
+#ifdef CYCLE_PROFILING
+ const int bblocks = iceildiv(n_max - n_0, strategy::out_width());
+ auto p=prof.ScopedProfiler(PROFILE_MERGE, (strategy::out_height() * bblocks * strategy::out_width() * sizeof(Tr)));
+#endif
+ strat.transforms.Merge(c_ptr, c_panel, ldc, m_0, m_max, n_0, n_max, biasptr, act, accumulate);
+ }
+}
+
// Run a kernel with integrated merge
template<>
template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
-void kernel_and_merge<false, Nothing>::run(
+void kernel_and_merge<false, false, Nothing>::run(
#ifdef CYCLE_PROFILING
profiler &prof,
#endif
- strategy &strat, const To *a_ptr, const To *b_panel, Tri *,
+ strategy &strat, const To *a_ptr, const To *b_panel, size_t, Tri *,
Tr *c_ptr, int ldc, int kern_k, unsigned int m_0, unsigned int m_max,
unsigned int n_0, unsigned int n_max, const Tr *biasptr,
const Activation &act, bool accumulate, const Nothing &, const int32_t *,
@@ -143,11 +175,11 @@ void kernel_and_merge<false, Nothing>::run(
// Run a kernel with integrated merge, quantizing
template<>
template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
-void kernel_and_merge<false, Requantize32>::run(
+void kernel_and_merge<false, false, Requantize32>::run(
#ifdef CYCLE_PROFILING
profiler &prof,
#endif
- strategy &strat, const To *a_ptr, const To *b_panel, Tri *,
+ strategy &strat, const To *a_ptr, const To *b_panel, size_t, Tri *,
Tr *c_ptr, int ldc, int kern_k, unsigned int m_0, unsigned int m_max,
unsigned int n_0, unsigned int n_max, const Tr *,
const Activation &, bool accumulate, const Requantize32 &qp, const int32_t *col_bias,
@@ -170,11 +202,11 @@ void kernel_and_merge<false, Requantize32>::run(
// Run a kernel and call the separate quantize step
template<>
template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
-void kernel_and_merge<true, Requantize32>::run(
+void kernel_and_merge<true, false, Requantize32>::run(
#ifdef CYCLE_PROFILING
profiler &prof,
#endif
- strategy &strat, const To *a_ptr, const To *b_panel, Tri *c_panel,
+ strategy &strat, const To *a_ptr, const To *b_panel, size_t, Tri *c_panel,
Tr *c_ptr, int ldc, int kern_k, unsigned int m_0,
unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *,
const Activation &, bool, const Requantize32 &qp, const int32_t *col_bias,
@@ -246,9 +278,49 @@ public:
typedef int32_t type;
};
+// Stripe width is a concept only needed for FixedFormat kernels. Use an accessor to avoid issues in other scenarios.
+template<typename strategy, bool FixedFormat>
+struct get_stripe_width {
+ static unsigned int get() {
+ return 0;
+ }
+};
+
+template<typename strategy>
+struct get_stripe_width<strategy, true> {
+ static unsigned int get() {
+ return strategy::stripe_width();
+ }
+};
+
+// KernelWeightFormat is a similar story.
+template<typename strategy, bool FixedFormat, typename To>
+struct get_kernel_weight_format {
+ static KernelWeightFormat get() {
+ return KernelWeightFormat::NON_FIXED;
+ }
+};
+
+template<typename strategy, typename To>
+struct get_kernel_weight_format<strategy, true, To> {
+ static KernelWeightFormat get() {
+ KernelWeightFormat kwf = strategy::kernel_weight_format();
+
+ // If we are using a BF16 kernel to do an FP32 problem (fast mode) then we need to set the BF16 flag on the
+ // weight format.
+ if (std::is_same<To, float>::value && std::is_same<typename strategy::operand_type, bfloat16>::value) {
+ uint32_t kwf_i = static_cast<uint32_t>(kwf);
+ kwf_i |= 0x10;
+ kwf = static_cast<KernelWeightFormat>(kwf_i);
+ }
+
+ return kwf;
+ }
+};
+
} // anonymous namespace
-template<typename strategy, typename To, typename Tr, typename OutputStage=Nothing, bool MergeStep=true, bool ForceThreadColumns=false>
+template<typename strategy, typename To, typename Tr, typename OutputStage=Nothing, bool MergeStep=true, bool FixedFormat=false, bool ForceThreadColumns=false>
class GemmInterleaved : public GemmCommon<To, Tr> {
typedef typename strategy::operand_type Toi;
typedef typename strategy::result_type Tri;
@@ -310,7 +382,7 @@ class GemmInterleaved : public GemmCommon<To, Tr> {
class blockwalker {
private:
/* Size loops, etc. based on our parent's configuration */
- const GemmInterleaved<strategy, To, Tr, OutputStage, MergeStep, ForceThreadColumns> &_parent;
+ const GemmInterleaved<strategy, To, Tr, OutputStage, MergeStep, FixedFormat, ForceThreadColumns> &_parent;
/* K, X and multi parameters for current iteration. */
unsigned int _k0=0, _x0=0, _multi=0;
@@ -325,9 +397,9 @@ class GemmInterleaved : public GemmCommon<To, Tr> {
bool _newmulti=true;
public:
- blockwalker(const GemmInterleaved<strategy, To, Tr, OutputStage, MergeStep, ForceThreadColumns> &parent) : _parent(parent) { }
+ blockwalker(const GemmInterleaved<strategy, To, Tr, OutputStage, MergeStep, FixedFormat, ForceThreadColumns> &parent) : _parent(parent) { }
- blockwalker(const GemmInterleaved<strategy, To, Tr, OutputStage, MergeStep, ForceThreadColumns> &parent,
+ blockwalker(const GemmInterleaved<strategy, To, Tr, OutputStage, MergeStep, FixedFormat, ForceThreadColumns> &parent,
unsigned int x_start, unsigned int x_end) : _parent(parent), _x0 (_x_start), _x_start(x_start), _x_end(x_end) { }
unsigned int xmax() {
@@ -666,7 +738,11 @@ public:
// Figure out how many "K" the kernel will actually process.
unsigned int kern_k = roundup(kmax - k0, strategy::k_unroll());
- const Toi *b_ptr = _B_transposed + (rounded_width * _Ktotal * multi) + (k0 * rounded_width) + (start_x * kern_k);
+ const Toi *b_ptr = FixedFormat ?
+ reinterpret_cast<const Toi *>(this->_Bptr) + (multi * this->_B_multi_stride) +
+ ((start_x / get_stripe_width<strategy, FixedFormat>::get()) * this->_ldb) +
+ (k0 * get_stripe_width<strategy, FixedFormat>::get()) :
+ _B_transposed + (rounded_width * _Ktotal * multi) + (k0 * rounded_width) + (start_x * kern_k);
unsigned int batch = batch_0;
unsigned int start_row = (start - (batch_0 * window_per_batch)) * strategy::out_height();
@@ -699,12 +775,12 @@ public:
}
// Perform the kernel and merge step, either separately or together as required.
- kernel_and_merge<MergeStep, OutputStage>::run(
+ kernel_and_merge<MergeStep, FixedFormat, OutputStage>::run(
#ifdef CYCLE_PROFILING
prof,
#endif
// Strategy and panel pointers
- strat, a_panel, b_ptr, c_panel,
+ strat, a_panel, b_ptr, this->_ldb, c_panel,
// Result buffer pointers
this->_Cptr + (batch * this->_C_batch_stride) + (multi * this->_C_multi_stride), this->_ldc,
// K size, and M/N ranges
@@ -802,6 +878,13 @@ public:
}
}
+ // For FixedFormat cases, figure out the B pointer. The loop below moves through batches and vertically through the output so this will be the same throughout.
+ if (FixedFormat) {
+ b_panel = reinterpret_cast<const Toi *>(this->_Bptr) + (current.multi() * this->_B_multi_stride) +
+ ((current.x0() / get_stripe_width<strategy, FixedFormat>::get()) * this->_ldb) +
+ (current.k0() * get_stripe_width<strategy, FixedFormat>::get());
+ }
+
/* Do the actual work. */
for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
unsigned int first_m = (batch == batch_0) ? m_0 : 0;
@@ -840,12 +923,12 @@ public:
}
// Perform the kernel and merge step, either separately or together as required.
- kernel_and_merge<MergeStep, OutputStage>::run(
+ kernel_and_merge<MergeStep, FixedFormat, OutputStage>::run(
#ifdef CYCLE_PROFILING
prof,
#endif
// Strategy and panel pointers
- strat, a_ptr, b_panel, c_panel,
+ strat, a_ptr, b_panel, this->_ldb, c_panel,
// Result buffer pointers
result_ptr, this->_ldc,
// K size, and M/N ranges
@@ -863,7 +946,9 @@ public:
}
}
- b_panel += (roundup(current.xmax() - current.x0(), strategy::out_width()) * kern_k);
+ if (FixedFormat == false) {
+ b_panel += (roundup(current.xmax() - current.x0(), strategy::out_width()) * kern_k);
+ }
}
}
}
@@ -910,14 +995,18 @@ public:
// Interface implementation - pretransposed
bool B_is_pretransposed() const override {
- return true;
+ return (FixedFormat == false);
}
bool B_pretranspose_required() const override {
- return (_B_transposed==nullptr);
+ return (FixedFormat == false) && (_B_transposed==nullptr);
}
size_t get_B_pretransposed_array_size() const override {
+ if (FixedFormat) {
+ return 0;
+ }
+
unsigned int x_size = roundup(_Nsize, strategy::out_width());
return (x_size * _Ktotal * _nmulti * sizeof(Toi)) + get_col_sum_size();
@@ -939,7 +1028,7 @@ public:
void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
requantize_bias(in_buffer, B, ldb, B_multi_stride);
- // Put the transposed data after the column sums - in non-transposing cases get_col_sum_size() == 0
+ // Put the transposed data after the column sums - in non-quantized cases get_col_sum_size() == 0
uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
Toi *buffer = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size());
_B_transposed = buffer;
@@ -1005,7 +1094,7 @@ public:
}
void set_pretransposed_B_data(void *in_buffer) override {
- // Put the transposed data after the column sums - in non-transposing cases get_col_sum_size() == 0
+ // Put the transposed data after the column sums - in non-quantized cases get_col_sum_size() == 0
uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
_B_transposed = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size());
col_bias = reinterpret_cast<int32_t *>(in_buffer);
@@ -1065,6 +1154,7 @@ public:
c.inner_block_size = _k_block;
c.outer_block_size = _x_block;
c.filter = get_type_name<strategy>();
+ c.weight_format = get_weight_format(get_kernel_weight_format<strategy, FixedFormat, To>::get(), sizeof(To));
return c;
}
@@ -1074,6 +1164,9 @@ public:
template<typename strategy, typename To, typename Tr, typename OutputStage=Nothing>
using GemmInterleavedNoMerge = GemmInterleaved<strategy, To, Tr, OutputStage, false>;
+template<typename strategy, typename To, typename Tr, typename OutputStage=Nothing>
+using GemmInterleavedFixedFormat = GemmInterleaved<strategy, To, Tr, OutputStage, true, true>;
+
template<typename strategy, typename To, typename Tr>
using GemmInterleavedPretransposedNoMergeQuantizedInline = GemmInterleaved<strategy, To, Tr, Requantize32, false>;
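
The get_stripe_width and get_kernel_weight_format helpers introduced above follow one pattern: specialise a small trait on the FixedFormat bool so that strategy::stripe_width() (or strategy::kernel_weight_format()) is only named when the strategy actually provides it, while the generic case falls back to a harmless default. Below is a minimal, self-contained sketch of that pattern; the two strategy types and the stripe width of 4 are invented for illustration and are not the library's kernels.

    #include <cstdio>

    // A strategy for the ordinary interleaved path: no stripe_width() at all.
    struct InterleavedStrategy {
    };

    // A fixed-format strategy that does expose a stripe width (value is made up).
    struct FixedFormatStrategy {
        static unsigned int stripe_width() { return 4; }
    };

    // Primary template: safe default, never touches Strategy::stripe_width().
    template<typename Strategy, bool FixedFormat>
    struct get_stripe_width {
        static unsigned int get() { return 0; }
    };

    // Partial specialisation for FixedFormat == true: forwards to the strategy.
    template<typename Strategy>
    struct get_stripe_width<Strategy, true> {
        static unsigned int get() { return Strategy::stripe_width(); }
    };

    int main() {
        // Compiles even though InterleavedStrategy has no stripe_width(),
        // because the FixedFormat == false case never references it.
        std::printf("%u\n", get_stripe_width<InterleavedStrategy, false>::get()); // prints 0
        std::printf("%u\n", get_stripe_width<FixedFormatStrategy, true>::get());  // prints 4
        return 0;
    }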