Integrate multi-threaded pretranspose_B_array

This is required for the case where rhs (B) is dynamic and needs to be pretransposed in every run. In a multi-threaded setting, this means the previously single-threaded pretranspose_B_array would become the bottleneck. Resolves COMPMID-5896. Signed-off-by: SiCong Li <sicong.li@arm.com> Change-Id: Id508c46992188a0f76a505152931d4955d04c16d Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9455 Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com> Reviewed-by: Jakub Sujak <jakub.sujak@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Benchmark: Arm Jenkins <bsgcomp@arm.com>
author: SiCong Li <sicong.li@arm.com> 2023-04-06 16:30:18 +0100
committer: SiCong Li <sicong.li@arm.com> 2023-04-26 09:10:38 +0000
commit: dba672cec878966e465bb476e896c8f75bbd9145 (patch)
tree: fcc8df3dc3f3799a616d2a10d52dd9bfdf6d2e33 /src/cpu/kernels/assembly
parent: 7fefac722568d997b4d9e136925e93c7abeb564a (diff)
download: ComputeLibrary-dba672cec878966e465bb476e896c8f75bbd9145.tar.gz
1 files changed, 23 insertions, 1 deletions
diff --git a/src/cpu/kernels/assembly/gemm_common.hpp b/src/cpu/kernels/assembly/gemm_common.hpp
index ece9ca5802..834cd1061e 100644
--- a/src/cpu/kernels/assembly/gemm_common.hpp
+++ b/src/cpu/kernels/assembly/gemm_common.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021,2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -113,9 +113,17 @@ public:
     {
         return 0;
     }
+    /* Amount of work available to the threaded pretranspose, i.e. the upper bound of its start/end window; this default is 1. */
+    virtual size_t get_B_pretranspose_window_size() const
+    {
+        return 1;
+    }
     /* Perform pretranspose - arguments are output, input, input row stride and input multi stride. */
     /* The "real" version of this depends on the templated operand type (see below).  */
     virtual void pretranspose_B_array_generic(void *, const void *, const int, const int) = 0;
+    /* Threaded version of the above: the two trailing size_t arguments are the window start and end. */
+    virtual void pretranspose_B_array_part_generic(void *, const void *, const int, const int, const size_t, const size_t) = 0;
+
     /* Set pretransposed data - the void * passed in must previously have been passed to pretranspose_B_array() for the same or a similar GEMM. */
     virtual void set_pretransposed_B_data(void *)
     {
@@ -225,6 +233,20 @@ public:
         pretranspose_B_array(out, static_cast<const To *>(in), row_stride, multi_stride);
     }
 
+    /* Threaded version of pretranspose_B_array(). This fallback ignores the window and runs the full non-threaded
+     * pretranspose, which is valid because the default window size is 1, so the only legal start/end are 0 and 1.
+     * NOTE(review): implementations exposing a larger window presumably must override this - confirm in subclasses. */
+    virtual void pretranspose_B_array_part(void *out, const To *in, const int row_stride, const int multi_stride, size_t, size_t)
+    {
+        pretranspose_B_array(out, in, row_stride, multi_stride);
+    }
+
+    /* Type-erased entry point: casts the input to const To * and forwards to pretranspose_B_array_part(). */
+    void pretranspose_B_array_part_generic(void *out, const void *in, const int row_stride, const int multi_stride, size_t start, size_t end) override
+    {
+        pretranspose_B_array_part(out, static_cast<const To *>(in), row_stride, multi_stride, start, end);
+    }
+
     /*** Indirect interface ***/
     virtual void set_indirect_parameters(size_t, const To *const *const *)
     {
author	SiCong Li <sicong.li@arm.com>	2023-04-06 16:30:18 +0100
committer	SiCong Li <sicong.li@arm.com>	2023-04-26 09:10:38 +0000
commit	dba672cec878966e465bb476e896c8f75bbd9145 (patch)
tree	fcc8df3dc3f3799a616d2a10d52dd9bfdf6d2e33 /src/cpu/kernels/assembly
parent	7fefac722568d997b4d9e136925e93c7abeb564a (diff)
download	ComputeLibrary-dba672cec878966e465bb476e896c8f75bbd9145.tar.gz