From d4a5bc5b87d970c196c6ea2a6b8bc2119005e0ca Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Thu, 12 Aug 2021 07:42:51 +0100 Subject: Ensure correct transformed matrices are used in CpuGemmConvolution Resolves: COMPMID-4763 Signed-off-by: Georgios Pinitas Change-Id: Iae2e093cfb7d2c7172603897afe1c6a2e5d1caa3 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/c/VisualCompute/ComputeLibrary/+/349725 Tested-by: bsgcomp Reviewed-by: Pablo Tello Comments-Addressed: bsgcomp Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6101 Tested-by: Arm Jenkins Reviewed-by: Pablo Marquez Tello --- .../NEON/functions/NEGEMMConvolutionLayer.cpp | 18 ++------------ src/runtime/cpu/operators/CpuGemmConvolution.cpp | 28 ++++++++++++++++------ src/runtime/cpu/operators/CpuGemmConvolution.h | 1 - .../operators/CpuGemmLowpMatrixMultiplyCore.cpp | 1 + 4 files changed, 24 insertions(+), 24 deletions(-) diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp index f63fcb02fd..c32584ec0d 100644 --- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp @@ -39,7 +39,6 @@ struct NEGEMMConvolutionLayer::Impl const ITensor *weights{ nullptr }; std::unique_ptr op{ nullptr }; ITensorPack run_pack{}; - ITensorPack prep_pack{}; MemoryGroup memory_group{}; IWeightsManager *weights_manager{ nullptr }; MemoryRequirements aux_mem_req{}; @@ -70,13 +69,8 @@ void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weig { TensorType::ACL_SRC_2, biases }, { TensorType::ACL_DST, output } }; - _impl->prep_pack = - { - { TensorType::ACL_SRC_1, weights }, - { TensorType::ACL_SRC_2, biases }, - }; _impl->aux_mem_req = _impl->op->workspace(); - _impl->workspace_tensors = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->workspace_tensors = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack); } Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, @@ -96,15 +90,7 @@ void NEGEMMConvolutionLayer::prepare() { if(!_impl->is_prepared) { - _impl->op->prepare(_impl->prep_pack); - auto has_reshape = std::find_if(_impl->aux_mem_req.begin(), - _impl->aux_mem_req.end(), - [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); - - if(has_reshape != std::end(_impl->aux_mem_req)) - { - _impl->weights->mark_as_unused(); - } + _impl->op->prepare(_impl->run_pack); // Release temporary tensors that are only used in prepare stage release_temporaries(_impl->aux_mem_req, _impl->workspace_tensors); diff --git a/src/runtime/cpu/operators/CpuGemmConvolution.cpp b/src/runtime/cpu/operators/CpuGemmConvolution.cpp index 864d7e2d0b..81d656c905 100644 --- a/src/runtime/cpu/operators/CpuGemmConvolution.cpp +++ b/src/runtime/cpu/operators/CpuGemmConvolution.cpp @@ -341,10 +341,16 @@ void CpuGemmConvolution::configure(const ITensorInfo *src, const ITensorInfo *we _reshape_kernel->configure(gemm_output_to_use, dst); } + // Check if GEMM transforms weights + // Modernise through COMPMID-4535 + bool gemm_trans_wei = _aux_mem[1].size > 0; // Asm Pretranspose + gemm_trans_wei = _mm_gemm != nullptr ? _aux_mem[3].size > 0 : gemm_trans_wei; // Tranpose RHS + gemm_trans_wei = _mm_gemmlowp != nullptr ? _aux_mem[5].size > 0 : gemm_trans_wei; // Transpose RHS + + // Check lifetime _aux_mem[Im2ColOutput] = MemoryInfo(offset_int_vec(Im2ColOutput), MemoryLifetime::Temporary, _im2col_output.total_size()); - _aux_mem[WeightsReshaped] = MemoryInfo(offset_int_vec(WeightsReshaped), MemoryLifetime::Prepare, _weights_reshaped.total_size()); + _aux_mem[WeightsReshaped] = MemoryInfo(offset_int_vec(WeightsReshaped), gemm_trans_wei ? MemoryLifetime::Prepare : MemoryLifetime::Persistent, _weights_reshaped.total_size()); _aux_mem[GemmOutput] = MemoryInfo(offset_int_vec(GemmOutput), MemoryLifetime::Temporary, _gemm_output.total_size()); - _aux_mem[GemmOutput3d] = MemoryInfo(offset_int_vec(GemmOutput3d), MemoryLifetime::Temporary, _gemm_output_3d.total_size()); } Status CpuGemmConvolution::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info, @@ -493,6 +499,7 @@ void CpuGemmConvolution::run(ITensorPack &tensors) CpuAuxTensorHandler im2col_output(offset_int_vec(Im2ColOutput), _im2col_output, tensors, false); CpuAuxTensorHandler gemm_output(offset_int_vec(GemmOutput), _gemm_output, tensors, false); + CpuAuxTensorHandler reshaped_wei(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors, false); bool out_has_padding = _skip_col2im && (dst->info()->padding().bottom != 0 || dst->info()->padding().top != 0); if(!_skip_im2col) @@ -510,12 +517,15 @@ void CpuGemmConvolution::run(ITensorPack &tensors) // Handle the case where output has top/bottom padding const ITensor *out_to_use = out_has_padding ? gemm_output.get() : dst; + Tensor gemm3d; _gemm_output_3d.extend_padding(out_to_use->info()->padding()); - CpuAuxTensorHandler gemm_output_3d(offset_int_vec(GemmOutput3d), _gemm_output_3d, tensors, true); - auto gemm_output_to_use = gemm_output.get(); + gemm3d.allocator()->soft_init(_gemm_output_3d); + gemm3d.allocator()->import_memory(out_to_use->buffer()); + auto gemm_output_to_use = gemm_output.get(); + if(_skip_im2col) { - gemm_output_to_use = gemm_output_3d.get(); + gemm_output_to_use = &gemm3d; } if(_skip_col2im && !out_has_padding) { @@ -525,6 +535,7 @@ void CpuGemmConvolution::run(ITensorPack &tensors) // Runs CpuGemm or CpuGemmLowpMatrixMultiplyCore functions ITensorPack pack_mm = tensors; pack_mm.add_const_tensor(TensorType::ACL_SRC_0, gemm_input_to_use); + pack_mm.add_const_tensor(TensorType::ACL_SRC_1, reshaped_wei.get()); pack_mm.add_tensor(TensorType::ACL_DST, gemm_output_to_use); if(_is_quantized) { @@ -583,10 +594,13 @@ void CpuGemmConvolution::prepare(ITensorPack &tensors) { TensorType::ACL_DST, weights_reshaped.get() } }; NEScheduler::get().schedule_op(_weights_reshape_kernel.get(), 3, _weights_reshape_kernel->window(), pack); - tensors.add_const_tensor(TensorType::ACL_SRC_1, weights_reshaped.get()); + weights->mark_as_unused(); // Prepare GEMM - _is_quantized ? _mm_gemmlowp->prepare(tensors) : _mm_gemm->prepare(tensors); + ITensorPack gemm_pack = tensors; + gemm_pack.add_const_tensor(TensorType::ACL_SRC_1, weights_reshaped.get()); + _is_quantized ? _mm_gemmlowp->prepare(gemm_pack) : _mm_gemm->prepare(gemm_pack); + _is_prepared = true; } } diff --git a/src/runtime/cpu/operators/CpuGemmConvolution.h b/src/runtime/cpu/operators/CpuGemmConvolution.h index 578586e7d1..7755bbe2a2 100644 --- a/src/runtime/cpu/operators/CpuGemmConvolution.h +++ b/src/runtime/cpu/operators/CpuGemmConvolution.h @@ -174,7 +174,6 @@ private: Im2ColOutput = 9, WeightsReshaped, GemmOutput, - GemmOutput3d, Count }; diff --git a/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp b/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp index f22446863c..7affc3f506 100644 --- a/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp +++ b/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp @@ -529,6 +529,7 @@ void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors) }; NEScheduler::get().schedule_op(_convert_to_signed_asymm.get(), Window::DimY, _convert_to_signed_asymm->window(), pack); a_to_use = signed_a.get(); + matrix_a = signed_a.get(); } // Run GEMM -- cgit v1.2.1