From ef637398a8c2060e15de438020c53331da8bd6dd Mon Sep 17 00:00:00 2001
From: Gunes Bayir <gunes.bayir@arm.com>
Date: Mon, 12 Feb 2024 21:32:51 +0000
Subject: Integrate new pretranspose_b_array with extra fused transpose of B

This patch fuses the transposition taking place in Acl with the transformations done in arm_gemm (called pretranspose_b_array) if the underlying kernel and transform supports it. This should improve start-up time (as it's for constant Rhs matrices) and memory footprint. The transformations in arm_gemm are kernel specific. The Rhs matrix is transformed into certain layouts to improve the performance.

Resolves: COMPMID-6595

Change-Id: Id2932dd966e59f903c279417bebcea83d9a42464
Signed-off-by: Gunes Bayir <gunes.bayir@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/11144
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
---
 src/core/NEON/kernels/arm_gemm/std_transforms_sme.hpp | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'src/core/NEON/kernels/arm_gemm/std_transforms_sme.hpp')
diff --git a/src/core/NEON/kernels/arm_gemm/std_transforms_sme.hpp b/src/core/NEON/kernels/arm_gemm/std_transforms_sme.hpp
index afe24e7ce0..40f61626a1 100644
--- a/src/core/NEON/kernels/arm_gemm/std_transforms_sme.hpp
+++ b/src/core/NEON/kernels/arm_gemm/std_transforms_sme.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -60,9 +60,14 @@ public:
         ConvolutionInterleave<height_vectors, block, VLType::SME>(out, ptr, stride, conv, rounded_stringlen, y0, ymax, k0, kmax, integrate_sums, row_sum_multiplier);
     }
 
+    bool PrepareB_supports_transpose() const {
+        return false;
+    }
+
     template<typename TIn>
     void PrepareB(TOperand *out, const TIn *in, const int stride, const int x0,
-                  const int xmax, const int k0, const int kmax) {
+                  const int xmax, const int k0, const int kmax, bool transposed) {
+        assert (!transposed);
         Transform<width_vectors, block,  true, VLType::SME>(out, in, stride, x0, xmax, k0, kmax);
     }
 
-- 
cgit v1.2.1