From d4a5bc5b87d970c196c6ea2a6b8bc2119005e0ca Mon Sep 17 00:00:00 2001
From: Georgios Pinitas <georgios.pinitas@arm.com>
Date: Thu, 12 Aug 2021 07:42:51 +0100
Subject: Ensure correct transformed matrices are used in CpuGemmConvolution

Resolves: COMPMID-4763

Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Change-Id: Iae2e093cfb7d2c7172603897afe1c6a2e5d1caa3
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/c/VisualCompute/ComputeLibrary/+/349725
Tested-by: bsgcomp <bsgcomp@arm.com>
Reviewed-by: Pablo Tello <pablo.tello@arm.com>
Comments-Addressed: bsgcomp <bsgcomp@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6101
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Pablo Marquez Tello <pablo.tello@arm.com>
---
 .../NEON/functions/NEGEMMConvolutionLayer.cpp      | 18 ++------------
 src/runtime/cpu/operators/CpuGemmConvolution.cpp   | 28 ++++++++++++++++------
 src/runtime/cpu/operators/CpuGemmConvolution.h     |  1 -
 .../operators/CpuGemmLowpMatrixMultiplyCore.cpp    |  1 +
 4 files changed, 24 insertions(+), 24 deletions(-)
diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
index f63fcb02fd..c32584ec0d 100644
--- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
@@ -39,7 +39,6 @@ struct NEGEMMConvolutionLayer::Impl
     const ITensor                           *weights{ nullptr };
     std::unique_ptr<cpu::CpuGemmConvolution> op{ nullptr };
     ITensorPack                              run_pack{};
-    ITensorPack                              prep_pack{};
     MemoryGroup                              memory_group{};
     IWeightsManager                         *weights_manager{ nullptr };
     MemoryRequirements                       aux_mem_req{};
@@ -70,13 +69,8 @@ void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weig
         { TensorType::ACL_SRC_2, biases },
         { TensorType::ACL_DST, output }
     };
-    _impl->prep_pack =
-    {
-        { TensorType::ACL_SRC_1, weights },
-        { TensorType::ACL_SRC_2, biases },
-    };
     _impl->aux_mem_req       = _impl->op->workspace();
-    _impl->workspace_tensors = manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
+    _impl->workspace_tensors = manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack);
 }
 
 Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
@@ -96,15 +90,7 @@ void NEGEMMConvolutionLayer::prepare()
 {
     if(!_impl->is_prepared)
     {
-        _impl->op->prepare(_impl->prep_pack);
-        auto has_reshape = std::find_if(_impl->aux_mem_req.begin(),
-                                        _impl->aux_mem_req.end(),
-                                        [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; });
-
-        if(has_reshape != std::end(_impl->aux_mem_req))
-        {
-            _impl->weights->mark_as_unused();
-        }
+        _impl->op->prepare(_impl->run_pack);
 
         // Release temporary tensors that are only used in prepare stage
         release_temporaries<Tensor>(_impl->aux_mem_req, _impl->workspace_tensors);
diff --git a/src/runtime/cpu/operators/CpuGemmConvolution.cpp b/src/runtime/cpu/operators/CpuGemmConvolution.cpp
index 864d7e2d0b..81d656c905 100644
--- a/src/runtime/cpu/operators/CpuGemmConvolution.cpp
+++ b/src/runtime/cpu/operators/CpuGemmConvolution.cpp
@@ -341,10 +341,16 @@ void CpuGemmConvolution::configure(const ITensorInfo *src, const ITensorInfo *we
         _reshape_kernel->configure(gemm_output_to_use, dst);
     }
 
+    // Check if GEMM transforms weights
+    // Modernise through COMPMID-4535
+    bool gemm_trans_wei = _aux_mem[1].size > 0;                                            // Asm Pretranspose
+    gemm_trans_wei      = _mm_gemm != nullptr ? _aux_mem[3].size > 0 : gemm_trans_wei;     // Tranpose RHS
+    gemm_trans_wei      = _mm_gemmlowp != nullptr ? _aux_mem[5].size > 0 : gemm_trans_wei; // Transpose RHS
+
+    // Check lifetime
     _aux_mem[Im2ColOutput]    = MemoryInfo(offset_int_vec(Im2ColOutput), MemoryLifetime::Temporary, _im2col_output.total_size());
-    _aux_mem[WeightsReshaped] = MemoryInfo(offset_int_vec(WeightsReshaped), MemoryLifetime::Prepare, _weights_reshaped.total_size());
+    _aux_mem[WeightsReshaped] = MemoryInfo(offset_int_vec(WeightsReshaped), gemm_trans_wei ? MemoryLifetime::Prepare : MemoryLifetime::Persistent, _weights_reshaped.total_size());
     _aux_mem[GemmOutput]      = MemoryInfo(offset_int_vec(GemmOutput), MemoryLifetime::Temporary, _gemm_output.total_size());
-    _aux_mem[GemmOutput3d]    = MemoryInfo(offset_int_vec(GemmOutput3d), MemoryLifetime::Temporary, _gemm_output_3d.total_size());
 }
 
 Status CpuGemmConvolution::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info,
@@ -493,6 +499,7 @@ void CpuGemmConvolution::run(ITensorPack &tensors)
 
     CpuAuxTensorHandler im2col_output(offset_int_vec(Im2ColOutput), _im2col_output, tensors, false);
     CpuAuxTensorHandler gemm_output(offset_int_vec(GemmOutput), _gemm_output, tensors, false);
+    CpuAuxTensorHandler reshaped_wei(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors, false);
 
     bool out_has_padding = _skip_col2im && (dst->info()->padding().bottom != 0 || dst->info()->padding().top != 0);
     if(!_skip_im2col)
@@ -510,12 +517,15 @@ void CpuGemmConvolution::run(ITensorPack &tensors)
 
     // Handle the case where output has top/bottom padding
     const ITensor *out_to_use = out_has_padding ? gemm_output.get() : dst;
+    Tensor         gemm3d;
     _gemm_output_3d.extend_padding(out_to_use->info()->padding());
-    CpuAuxTensorHandler gemm_output_3d(offset_int_vec(GemmOutput3d), _gemm_output_3d, tensors, true);
-    auto                gemm_output_to_use = gemm_output.get();
+    gemm3d.allocator()->soft_init(_gemm_output_3d);
+    gemm3d.allocator()->import_memory(out_to_use->buffer());
+    auto gemm_output_to_use = gemm_output.get();
+
     if(_skip_im2col)
     {
-        gemm_output_to_use = gemm_output_3d.get();
+        gemm_output_to_use = &gemm3d;
     }
     if(_skip_col2im && !out_has_padding)
     {
@@ -525,6 +535,7 @@ void CpuGemmConvolution::run(ITensorPack &tensors)
     // Runs CpuGemm or CpuGemmLowpMatrixMultiplyCore functions
     ITensorPack pack_mm = tensors;
     pack_mm.add_const_tensor(TensorType::ACL_SRC_0, gemm_input_to_use);
+    pack_mm.add_const_tensor(TensorType::ACL_SRC_1, reshaped_wei.get());
     pack_mm.add_tensor(TensorType::ACL_DST, gemm_output_to_use);
     if(_is_quantized)
     {
@@ -583,10 +594,13 @@ void CpuGemmConvolution::prepare(ITensorPack &tensors)
             { TensorType::ACL_DST, weights_reshaped.get() }
         };
         NEScheduler::get().schedule_op(_weights_reshape_kernel.get(), 3, _weights_reshape_kernel->window(), pack);
-        tensors.add_const_tensor(TensorType::ACL_SRC_1, weights_reshaped.get());
+        weights->mark_as_unused();
 
         // Prepare GEMM
-        _is_quantized ? _mm_gemmlowp->prepare(tensors) : _mm_gemm->prepare(tensors);
+        ITensorPack gemm_pack = tensors;
+        gemm_pack.add_const_tensor(TensorType::ACL_SRC_1, weights_reshaped.get());
+        _is_quantized ? _mm_gemmlowp->prepare(gemm_pack) : _mm_gemm->prepare(gemm_pack);
+
         _is_prepared = true;
     }
 }
diff --git a/src/runtime/cpu/operators/CpuGemmConvolution.h b/src/runtime/cpu/operators/CpuGemmConvolution.h
index 578586e7d1..7755bbe2a2 100644
--- a/src/runtime/cpu/operators/CpuGemmConvolution.h
+++ b/src/runtime/cpu/operators/CpuGemmConvolution.h
@@ -174,7 +174,6 @@ private:
         Im2ColOutput = 9,
         WeightsReshaped,
         GemmOutput,
-        GemmOutput3d,
         Count
     };
 
diff --git a/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp b/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp
index f22446863c..7affc3f506 100644
--- a/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp
@@ -529,6 +529,7 @@ void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors)
         };
         NEScheduler::get().schedule_op(_convert_to_signed_asymm.get(), Window::DimY, _convert_to_signed_asymm->window(), pack);
         a_to_use = signed_a.get();
+        matrix_a = signed_a.get();
     }
 
     // Run GEMM
-- 
cgit v1.2.1