From ef637398a8c2060e15de438020c53331da8bd6dd Mon Sep 17 00:00:00 2001 From: Gunes Bayir Date: Mon, 12 Feb 2024 21:32:51 +0000 Subject: Integrate new pretranspose_b_array with extra fused transpose of B This patch fuses the transposition taking place in ACL with the transformations done in arm_gemm (called pretranspose_b_array) if the underlying kernel and transform support it. This should improve start-up time (as it's for constant Rhs matrices) and memory footprint. The transformations in arm_gemm are kernel-specific. The Rhs matrix is transformed into certain layouts to improve the performance. Resolves: COMPMID-6595 Change-Id: Id2932dd966e59f903c279417bebcea83d9a42464 Signed-off-by: Gunes Bayir Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/11144 Tested-by: Arm Jenkins Reviewed-by: Viet-Hoa Do Comments-Addressed: Arm Jenkins Benchmark: Arm Jenkins --- src/core/NEON/kernels/arm_gemm/std_transforms_fixed.hpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'src/core/NEON/kernels/arm_gemm/std_transforms_fixed.hpp') diff --git a/src/core/NEON/kernels/arm_gemm/std_transforms_fixed.hpp b/src/core/NEON/kernels/arm_gemm/std_transforms_fixed.hpp index 4669be9993..a9cbf4ec8d 100644 --- a/src/core/NEON/kernels/arm_gemm/std_transforms_fixed.hpp +++ b/src/core/NEON/kernels/arm_gemm/std_transforms_fixed.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 Arm Limited. + * Copyright (c) 2018-2020, 2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -63,9 +63,14 @@ public: ConvolutionInterleave(out, ptr, stride, conv, rounded_stringlen, y0, ymax, k0, kmax, integrate_sums, row_sum_multiplier); } + bool PrepareB_supports_transpose() const { + return false; + } + template void PrepareB(TOperand *out, const TIn *in, const int stride, const int x0, - const int xmax, const int k0, const int kmax) const { + const int xmax, const int k0, const int kmax, bool transposed) const { + assert(!transposed); Transform(out, in, stride, x0, xmax, k0, kmax); } -- cgit v1.2.1