From dba672cec878966e465bb476e896c8f75bbd9145 Mon Sep 17 00:00:00 2001
From: SiCong Li
Date: Thu, 6 Apr 2023 16:30:18 +0100
Subject: Integrate multi-threaded pretranspose_B_array

This is required for the case where rhs (B) is dynamic and needs to be
pretransposed in every run. In a multi-threaded setting, this means the
previously single-threaded pretranspose_B_array would become the
bottleneck.

Resolves COMPMID-5896

Signed-off-by: SiCong Li
Change-Id: Id508c46992188a0f76a505152931d4955d04c16d
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9455
Tested-by: Arm Jenkins
Reviewed-by: Viet-Hoa Do
Reviewed-by: Jakub Sujak
Comments-Addressed: Arm Jenkins
Benchmark: Arm Jenkins
---
 arm_compute/runtime/IScheduler.h                   |  6 +--
 .../NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp | 58 ++++++++++++++++++----
 .../NEON/kernels/arm_gemm/gemm_interleaved.hpp     | 39 +++++++++++++--
 src/cpu/kernels/assembly/gemm_common.hpp           | 24 ++++++++-
 .../operators/internal/CpuGemmAssemblyDispatch.cpp | 50 +++++++++++++++++--
 src/runtime/IScheduler.cpp                         |  6 +--
 6 files changed, 159 insertions(+), 24 deletions(-)

diff --git a/arm_compute/runtime/IScheduler.h b/arm_compute/runtime/IScheduler.h
index 129009c58d..df5a44001f 100644
--- a/arm_compute/runtime/IScheduler.h
+++ b/arm_compute/runtime/IScheduler.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -176,9 +176,9 @@ public:
 
     /** Execute all the passed workloads
      *
-     * @note there is no guarantee regarding the order in which the workloads will be executed or whether or not they will be executed in parallel.
+     * @note There is no guarantee regarding the order in which the workloads will be executed or whether or not they will be executed in parallel.
      *
-     * @param[in] workloads Array of workloads to run
+     * @param[in] workloads List of workloads to run
     * @param[in] tag       String that can be used by profiling tools to identify the workloads run by the scheduler (Can be null).
      */
     virtual void run_tagged_workloads(std::vector<Workload> &workloads, const char *tag);
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp
index 90e2f07607..0bbcd10b66 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2022 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -614,6 +614,10 @@ public:
         return size;
     }
 
+    size_t get_B_pretranspose_window_size() const override {
+        return _args._nmulti * iceildiv(_args._Nsize, strategy::out_width());
+    }
+
     void requantize_bias(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
         if (std::is_same<OutputStage, Requantize32>::value) {
             _col_bias = reinterpret_cast<int32_t *>(in_buffer);
@@ -628,25 +632,62 @@ public:
     }
 
     void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
-        requantize_bias(in_buffer, B, ldb, B_multi_stride);
+        pretranspose_B_array_part(in_buffer, B, ldb, B_multi_stride, 0, get_B_pretranspose_window_size());
+    }
+
+    void pretranspose_B_array_part(void *in_buffer, const To *B, const int ldb, const int B_multi_stride, size_t start, size_t end) override {
+        if (end >= get_B_pretranspose_window_size()) {
+            requantize_bias(in_buffer, B, ldb, B_multi_stride);
+        }
 
         // Put the transposed data after the column sums - in non-transposing cases get_col_sum_size() == 0
         uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
-        Troi *buffer = reinterpret_cast<Troi *>(buffer_int + get_col_sum_size());
-        _B_transposed = buffer;
+        Troi *buffer_base = reinterpret_cast<Troi *>(buffer_int + get_col_sum_size());
+        _B_transposed = buffer_base;
 
         strategy strat(_args._ci);
+        size_t work_per_multi = iceildiv(_args._Nsize, strategy::out_width());
+
+        for (unsigned int multi=(start / work_per_multi); multi<_args._nmulti; multi++) {
+            // Work out which part of the window space this multi occupies,
+            // skip to the next multi or exit as needed.
+            size_t wk_start = multi * work_per_multi;
+            size_t wk_end = (multi + 1) * work_per_multi;
+
+            assert(wk_end > start);
+
+            if (wk_start >= end) {
+                break;
+            }
 
-        for (unsigned int multi=0; multi<_args._nmulti; multi++) {
             for (unsigned int k0=0; k0<_Ktotal; k0+=_k_block) {
                 const unsigned int kmax=std::min(k0 + _k_block, _Ktotal);
 
                 /* Figure out the size of each block. */
                 unsigned int k_size = kmax - k0;
 
+                // Correct the N range and buffer base if we are not processing the whole block.
+                size_t n_start = 0;
+                size_t n_end = _args._Nsize;
+
+                // If we are not doing the first columns, update the buffer write position and starting N value.
+                if (start > wk_start) {
+                    n_start = (start - wk_start) * strategy::out_width();
+                }
+
+                // If we are not doing the last items, update the final N value.
+                if (end < wk_end) {
+                    n_end = (end - wk_start) * strategy::out_width();
+                }
+
+                // Set the buffer pointer
+                Troi *buffer = buffer_base +
+                    (roundup(_args._Nsize, strategy::out_width()) * (multi * _Ktotal + k0)) +
+                    (n_start * roundup(k_size, strategy::k_unroll()));
+
                 if (_args._Ksections > 1) {
                     // We need to insert padding at the end of each K section.
-                    // The computation needed is a little delicate - the coordinates from the block walker are expressed in
+                    // The computation needed is a little delicate - the k0/kmax coordinates are expressed in
                     // terms of the full, padded, _Ktotal.
                     // But we need to transform each section with reference to the original, unpadded, input, letting the
                     // transform pad each section as needed.
@@ -657,7 +698,7 @@ public:
                     // The expected output format is also an entire columns interleaved, then the next set of
                     // columns, and so on. This means, as we are breaking it up vertically, we have to do it one column at
                    // a time.
-                    for (unsigned int x0=0; x0 < _args._Nsize; x0 += strategy::out_width() ){
+                    for (unsigned int x0 = n_start; x0 < n_end; x0 += strategy::out_width()) {
                         unsigned int xmax = std::min(x0 + strategy::out_width(), _args._Nsize);
 
                         // Track where we are and how much work is left.
@@ -690,8 +731,7 @@ public:
                 } else {
                     // In the single K section case, can process the whole lot in one go.
                     strat.transforms.PrepareB(buffer, B + (multi * B_multi_stride), ldb,
-                                              0, _args._Nsize, k0, std::min(kmax, _args._Ksize));
-                    buffer += roundup(_args._Nsize, strategy::out_width()) * roundup(kmax-k0, strategy::k_unroll());
+                                              n_start, n_end, k0, std::min(kmax, _args._Ksize));
                 }
             }
         }
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
index c2fd0b0e8c..13f548e39e 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
@@ -31,6 +31,7 @@
 #include "convolver.hpp"
 #include "kernel_weight_format.hpp"
 #include "kernel_traits.hpp"
+#include "kernel_weight_format.hpp"
 #include "mergeresults.hpp"
 #include "performance_parameters.hpp"
 #include "quantized.hpp"
@@ -1039,6 +1040,13 @@ public:
         return (x_size * _Ktotal * _nmulti * sizeof(Toi)) + get_col_sum_size();
     }
 
+    size_t get_B_pretranspose_window_size() const override {
+        size_t n_blocks = iceildiv(_Nsize, _x_block);
+        size_t k_blocks = iceildiv(_Ktotal, _k_block);
+
+        return n_blocks * k_blocks * _nmulti;
+    }
+
     void requantize_bias(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
         if (std::is_same<OutputStage, Requantize32>::value) {
             col_bias = reinterpret_cast<int32_t *>(in_buffer);
@@ -1053,7 +1061,14 @@ public:
     }
 
     void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
-        requantize_bias(in_buffer, B, ldb, B_multi_stride);
+        pretranspose_B_array_part(in_buffer, B, ldb, B_multi_stride, 0, get_B_pretranspose_window_size());
+    }
+
+    void pretranspose_B_array_part(void *in_buffer, const To *B, const int ldb, const int B_multi_stride, size_t start, size_t end) override {
+        // Perform column sums etc as part of the last block.
+        if (end >= get_B_pretranspose_window_size()) {
+            requantize_bias(in_buffer, B, ldb, B_multi_stride);
+        }
 
         // Put the transposed data after the column sums - in non-quantized cases get_col_sum_size() == 0
         uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
@@ -1063,7 +1078,20 @@ public:
         blockwalker current(*this);
         strategy strat(_ci);
 
-        do {
+        // Skip over blocks we aren't doing
+        for(size_t i = 0; i < start; i++) {
+            buffer += roundup(current.xmax() - current.x0(), strategy::out_width()) * roundup(current.kmax() - current.k0(), strategy::k_unroll());
+            current.advance();
+        }
+
+        size_t blocks_left = (end - start);
+
+        // Double check that we haven't run out of work
+        if (current.done()) {
+            blocks_left = 0;
+        }
+
+        for (/* blocks_left initialized above */; blocks_left > 0; blocks_left--) {
             /* Figure out the size of each block. */
             unsigned int k_size = (current.kmax() - current.k0());
 
@@ -1117,7 +1145,12 @@ public:
                                           current.x0(), current.xmax(), current.k0(), std::min(current.kmax(), _Ksize));
                 buffer += roundup(current.xmax() - current.x0(), strategy::out_width()) * roundup(current.kmax() - current.k0(), strategy::k_unroll());
             }
-        } while (current.advance());
+
+            // Advance to the next block, break if we run off the end.
+            if (!current.advance()) {
+                break;
+            }
+        }
     }
 
     void set_pretransposed_B_data(void *in_buffer) override {
diff --git a/src/cpu/kernels/assembly/gemm_common.hpp b/src/cpu/kernels/assembly/gemm_common.hpp
index ece9ca5802..834cd1061e 100644
--- a/src/cpu/kernels/assembly/gemm_common.hpp
+++ b/src/cpu/kernels/assembly/gemm_common.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021,2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -113,9 +113,17 @@ public:
     {
         return 0;
     }
+    /* Amount of work for the threaded cases */
+    virtual size_t get_B_pretranspose_window_size() const
+    {
+        return 1;
+    }
     /* Perform pretranspose - arguments are output, input, input row stride and input multi stride. */
     /* The "real" version of this depends on the templated operand type (see below). */
     virtual void pretranspose_B_array_generic(void *, const void *, const int, const int) = 0;
+    /* Threaded version with window start/end parameters */
+    virtual void pretranspose_B_array_part_generic(void *, const void *, const int, const int, const size_t, const size_t) = 0;
+
     /* Set pretransposed data - the void * passed in must previously have been passed to pretranspose_B_array() for the same or a similar GEMM. */
     virtual void set_pretransposed_B_data(void *)
     {
@@ -225,6 +233,20 @@ public:
         pretranspose_B_array(out, static_cast<const To *>(in), row_stride, multi_stride);
     }
 
+    /* Threaded versions of the above.
+     * The fallback/backwards compatible version of the threaded interface exposes a window size of 1 and
+     * just calls the non-threaded functions to do the work. This is valid as with window size of 1 the only
+     * legal values for start and end are 0 and 1 respectively. */
+    virtual void pretranspose_B_array_part(void *out, const To *in, const int row_stride, const int multi_stride, size_t, size_t)
+    {
+        pretranspose_B_array(out, in, row_stride, multi_stride);
+    };
+
+    void pretranspose_B_array_part_generic(void *out, const void *in, const int row_stride, const int multi_stride, size_t start, size_t end) override
+    {
+        pretranspose_B_array_part(out, static_cast<const To *>(in), row_stride, multi_stride, start, end);
+    }
+
     /*** Indirect interface ***/
     virtual void set_indirect_parameters(size_t, const To *const *const *)
     {
diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
index 9af98be41d..9c85631406 100644
--- a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
+++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
@@ -38,6 +38,46 @@ namespace arm_compute
 {
 namespace cpu
 {
+namespace
+{
+/** Run pretranspose_B_array in parallel (1D static scheduling)
+ *
+ * @tparam TypeInput
+ * @tparam TypeOutput
+ *
+ * @param[in] gemm_asm         GemmCommon kernel to run
+ * @param[in] dst              Pretransposed B array
+ * @param[in] src              B array to be pretransposed
+ * @param[in] src_ld           Stride in y
+ * @param[in] src_multi_stride Stride in z ("multi")
+ * @param[in] num_threads      Number of threads to run this method. Must be >= 1
+ */
+template <typename TypeInput, typename TypeOutput>
+void run_parallel_pretranspose_B_array(arm_gemm::GemmCommon<TypeInput, TypeOutput> *gemm_asm, ITensor *dst, const TypeInput *src, int src_ld, int src_multi_stride, unsigned int num_threads)
+{
+    ARM_COMPUTE_ERROR_ON(gemm_asm == nullptr);
+    ARM_COMPUTE_ERROR_ON(num_threads == 0);
+    // The window size is also the total workload size
+    const unsigned int wsize = gemm_asm->get_B_pretranspose_window_size();
+
+    std::vector<IScheduler::Workload> workloads(num_threads);
+    for(unsigned int t = 0; t < num_threads; ++t)
+    {
+        workloads[t] = [ = ](const ThreadInfo & info)
+        {
+            const unsigned int start = (info.thread_id * wsize) / num_threads;
+            const unsigned int end   = ((info.thread_id + 1) * wsize) / num_threads;
+
+            if(start < end)
+            {
+                gemm_asm->pretranspose_B_array_part(dst->buffer(), src, src_ld, src_multi_stride, start, end);
+            }
+        };
+    }
+    NEScheduler::get().run_tagged_workloads(workloads, "CpuGemmAssemblyDispatch/pretranspose_B_array");
+}
+} // namespace
+
 using namespace arm_compute::experimental;
 
 namespace
 {
@@ -436,7 +476,7 @@ void Fallback::prepare(ITensorPack &tensors)
 
         CpuAuxTensorHandler pretranspose(offset_int_vec(Pretranspose), _pretranspose_info, tensors, false);
         ARM_COMPUTE_ERROR_ON(pretranspose.get()->buffer() == nullptr);
-        _gemm_kernel_asm->pretranspose_B_array(pretranspose.get()->buffer(), in1_ptr, ldb, multi_stride_b);
+        run_parallel_pretranspose_B_array<TypeInput, TypeOutput>(_gemm_kernel_asm.get(), pretranspose.get(), in1_ptr, ldb, multi_stride_b, NEScheduler::get().num_threads());
 
         b->mark_as_unused();
     }
@@ -493,9 +533,9 @@ void Fallback::run(ITensorPack &tensors)
     // Check if B is pre-tranposed and de-reference if not
     if(!_gemm_kernel_asm->B_is_pretransposed())
     {
-        ldb = b->info()->strides_in_bytes().y() / b->info()->element_size();
-        multi_stride_b = b->info()->strides_in_bytes().z() / b->info()->element_size();
-        in1_ptr = reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes());
+        ldb            = b->info()->strides_in_bytes().y() / b->info()->element_size();
+        multi_stride_b = b->info()->strides_in_bytes().z() / b->info()->element_size();
+        in1_ptr        = reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes());
     }
 
     // If necessary, run pretranspose every time if either weights or biases are non-constant
@@ -522,7 +562,7 @@ void Fallback::run(ITensorPack &tensors)
         }
         else
         {
-            _gemm_kernel_asm->pretranspose_B_array(pretranspose.get()->buffer(), b_ptr, ldb, multi_stride_b);
+            run_parallel_pretranspose_B_array<TypeInput, TypeOutput>(_gemm_kernel_asm.get(), pretranspose.get(), b_ptr, ldb, multi_stride_b, NEScheduler::get().num_threads());
         }
     }
 }
diff --git a/src/runtime/IScheduler.cpp b/src/runtime/IScheduler.cpp
index 39f41555fa..436fd9ca16 100644
--- a/src/runtime/IScheduler.cpp
+++ b/src/runtime/IScheduler.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2022 Arm Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -139,7 +139,7 @@ void IScheduler::schedule_common(ICPPKernel *kernel, const Hints &hints, const W
         default:
             ARM_COMPUTE_ERROR("Unknown strategy");
     }
-    // Make sure the smallest window is larger than minimim workload size
+    // Make sure the smallest window is larger than minimum workload size
     num_windows = adjust_num_of_windows(max_window, hints.split_dimension(), num_windows, *kernel, cpu_info());
 
     std::vector<IScheduler::Workload> workloads(num_windows);
@@ -178,7 +178,7 @@ void IScheduler::run_tagged_workloads(std::vector &workloads, const ch
 std::size_t IScheduler::adjust_num_of_windows(const Window &window, std::size_t split_dimension, std::size_t init_num_windows, const ICPPKernel &kernel, const CPUInfo &cpu_info)
 {
     // Mitigation of the narrow split issue, which occurs when the split dimension is too small to split (hence "narrow").
-    if(window.num_iterations(split_dimension) < init_num_windows )
+    if(window.num_iterations(split_dimension) < init_num_windows)
     {
         auto recommended_split_dim = Window::DimX;
         for(std::size_t dims = Window::DimY; dims <= Window::DimW; ++dims)
-- 
cgit v1.2.1
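
For reference, below is a minimal, self-contained sketch (not part of the patch above) of the static 1D split that run_parallel_pretranspose_B_array performs: the window reported by get_B_pretranspose_window_size() is divided evenly over the threads, and each thread processes its own [start, end) sub-range via pretranspose_B_array_part(). MockGemm, the window size of 23 and the plain std::thread pool used here are illustrative stand-ins only; the real code goes through arm_gemm::GemmCommon and the NEScheduler tagged-workload mechanism shown in the diff.

// Illustrative sketch only: MockGemm stands in for arm_gemm::GemmCommon.
#include <cstddef>
#include <cstdio>
#include <thread>
#include <vector>

struct MockGemm {
    // Total pretranspose workload, analogous to get_B_pretranspose_window_size().
    std::size_t get_B_pretranspose_window_size() const { return 23; }

    // Analogous to pretranspose_B_array_part(): handles window units [start, end).
    void pretranspose_B_array_part(std::size_t start, std::size_t end) const {
        std::printf("processing window [%zu, %zu)\n", start, end);
    }
};

int main() {
    MockGemm gemm;
    const std::size_t wsize       = gemm.get_B_pretranspose_window_size();
    const std::size_t num_threads = 4;

    std::vector<std::thread> workers;
    for (std::size_t t = 0; t < num_threads; t++) {
        // Same static partitioning as the patch: thread t gets [t*wsize/N, (t+1)*wsize/N).
        const std::size_t start = (t * wsize) / num_threads;
        const std::size_t end   = ((t + 1) * wsize) / num_threads;
        if (start < end) {
            workers.emplace_back([&gemm, start, end] { gemm.pretranspose_B_array_part(start, end); });
        }
    }
    for (auto &w : workers) {
        w.join();
    }
    return 0;
}

Because the bounds are computed as (t * wsize) / num_threads, the sub-ranges are contiguous, disjoint and cover the whole window even when wsize is not a multiple of num_threads; threads whose range is empty simply skip the call, matching the start < end guard in the patch.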