author    Georgios Pinitas <georgios.pinitas@arm.com>  2021-08-12 06:28:09 +0100
committer SiCongLi <sicong.li@arm.com>                 2021-08-12 15:27:58 +0100
commit    66cb91baacf4dadf1b467bc287e7e9bf17313aa1 (patch)
tree      2b6a64b43afea654f423c64ac96a21b9d31a45c2
parent    593c2425e6b94828fb486244e42c275a89a71aff (diff)
download  ComputeLibrary-66cb91baacf4dadf1b467bc287e7e9bf17313aa1.tar.gz
Ensure that correct transformed matrices are used in CpuFullyConnected
The execution pack of CpuFullyConnected was being altered explicitly with local objects that went out of scope, leading to incorrect results or memory-related issues. Track the transformed weights and register the weights matrix explicitly during execution, honoring the object lifetime scope.

Resolves: COMPMID-4762, COMPMID-4764

Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Change-Id: I53449c377fb1cfccdf5e6f9505d963518748c318
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/c/VisualCompute/ComputeLibrary/+/349345
Tested-by: bsgcomp <bsgcomp@arm.com>
Reviewed-by: Pablo Tello <pablo.tello@arm.com>
Comments-Addressed: bsgcomp <bsgcomp@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6092
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-by: SiCong Li <sicong.li@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
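For context, here is a minimal self-contained C++ sketch of the lifetime problem the commit message describes. The types and names below (Tensor, TensorPack, Operator) are illustrative stand-ins, not the actual Compute Library API: a pack that stores a non-owning pointer to a function-local tensor dangles once that function returns, whereas keeping the transformed weights as a member of the operator and registering them into a local copy of the pack at execution time keeps the pointer valid for the whole call.

#include <iostream>
#include <map>
#include <string>

// Simplified stand-ins for the library types; identifiers are illustrative only.
struct Tensor
{
    std::string data;
};

// A pack maps slot ids to non-owning tensor pointers, similar in spirit to ITensorPack.
using TensorPack = std::map<int, const Tensor *>;

constexpr int ACL_SRC_1 = 1;

// Buggy pattern: the pack keeps a pointer to a function-local tensor,
// which dangles as soon as configure_pack_buggy() returns.
void configure_pack_buggy(TensorPack &pack)
{
    Tensor transformed_weights{ "reshaped" }; // local, destroyed at end of scope
    pack[ACL_SRC_1] = &transformed_weights;   // dangling pointer after return
}

// Fixed pattern: the operator owns the transformed weights for its whole
// lifetime and registers them into a local copy of the pack during run().
class Operator
{
public:
    void prepare()
    {
        _transformed_weights.data = "reshaped"; // transform once, keep alive
    }
    void run(TensorPack pack) // works on a copy of the caller's pack
    {
        pack[ACL_SRC_1] = &_transformed_weights; // valid for the whole call
        std::cout << "running with weights: " << pack[ACL_SRC_1]->data << '\n';
    }

private:
    Tensor _transformed_weights{}; // lives as long as the operator
};

int main()
{
    Operator op;
    op.prepare();
    op.run({});
}

The patch below follows the same idea: CpuFullyConnected keeps the transformed-weights TensorInfo as a member, materializes it through an auxiliary tensor handler inside run(), and adds it to a local gemm pack instead of mutating the caller's pack with short-lived objects.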
-rw-r--r--  src/runtime/NEON/functions/NEFullyConnectedLayer.cpp            19
-rw-r--r--  src/runtime/cpu/operators/CpuFullyConnected.cpp                  56
-rw-r--r--  src/runtime/cpu/operators/CpuFullyConnected.h                    12
-rw-r--r--  src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp      9
-rw-r--r--  src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp   2
5 files changed, 45 insertions, 53 deletions
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index d815a73b93..504200e9ce 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -44,7 +44,6 @@ struct NEFullyConnectedLayer::Impl
const ITensor *original_weights{ nullptr };
ITensorPack run_pack{};
- ITensorPack prep_pack{};
WorkspaceData<Tensor> workspace{};
experimental::MemoryRequirements aux_mem_req{};
@@ -79,8 +78,7 @@ void NEFullyConnectedLayer::configure(const ITensor *input, const ITensor *weigh
_impl->aux_mem_req = _impl->op->workspace();
_impl->run_pack = { { ACL_SRC_0, input }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, output } };
- _impl->prep_pack = { { ACL_SRC_1, weights }, { ACL_SRC_2, biases } };
- _impl->workspace = manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
+ _impl->workspace = manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack);
}
Status NEFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
@@ -101,20 +99,7 @@ void NEFullyConnectedLayer::prepare()
{
if(!_impl->is_prepared)
{
- _impl->op->prepare(_impl->prep_pack);
-
- auto has_reshape = std::find_if(_impl->aux_mem_req.begin(),
- _impl->aux_mem_req.end(),
- [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; });
-
- if(has_reshape != std::end(_impl->aux_mem_req))
- {
- _impl->original_weights->mark_as_unused();
- }
- else
- {
- _impl->run_pack.add_const_tensor(ACL_SRC_1, _impl->original_weights);
- }
+ _impl->op->prepare(_impl->run_pack);
// Release temporary tensors that are only used in prepare stage
release_temporaries<Tensor>(_impl->aux_mem_req, _impl->workspace);
diff --git a/src/runtime/cpu/operators/CpuFullyConnected.cpp b/src/runtime/cpu/operators/CpuFullyConnected.cpp
index e7808fbc82..eeabce0753 100644
--- a/src/runtime/cpu/operators/CpuFullyConnected.cpp
+++ b/src/runtime/cpu/operators/CpuFullyConnected.cpp
@@ -150,9 +150,11 @@ CpuFullyConnected::CpuFullyConnected()
_flattened_src(),
_converted_weights(),
_reshaped_weights(),
+ _trans_weights(),
+ _trans_weights_idx(AuxTensorIdx::Count),
_aux_mem(Count),
- _are_weights_converted(false),
- _are_weights_reshaped(false),
+ _needs_weights_conversion(false),
+ _needs_weights_reshape(false),
_is_fc_after_conv(false),
_is_quantized_asymmetric(false),
_is_prepared(false)
@@ -230,11 +232,13 @@ void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *wei
dst,
fc_info));
- _are_weights_converted = true;
- _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
- _is_fc_after_conv = true;
- _is_quantized_asymmetric = is_data_type_quantized_asymmetric(src->data_type());
- _is_prepared = false;
+ _needs_weights_conversion = false;
+ _needs_weights_reshape = fc_info.transpose_weights ? !fc_info.are_weights_reshaped : false;
+ _needs_weights_reshape = _needs_weights_reshape && !fc_info.retain_internal_weights;
+ _is_fc_after_conv = true;
+ _is_quantized_asymmetric = is_data_type_quantized_asymmetric(src->data_type());
+ _is_prepared = false;
+ _trans_weights_idx = AuxTensorIdx::Count;
// With the Fully Connected layer we can have 4 different cases:
// 1) Convolution layer -> Fully Connected layer without batches
@@ -258,12 +262,13 @@ void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *wei
}
// Reshape weights if needed
- if(!_are_weights_reshaped)
+ if(_needs_weights_reshape)
{
// Reshape the weights
_transpose_weights = std::make_unique<kernels::CpuTransposeKernel>();
_transpose_weights->configure(weights, &_reshaped_weights);
- weights_to_use = &_reshaped_weights;
+ weights_to_use = &_reshaped_weights;
+ _trans_weights_idx = AuxTensorIdx::TransposedWeights;
}
// Convert weights if needed
@@ -276,8 +281,9 @@ void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *wei
src->tensor_shape(),
fc_info.weights_trained_layout);
- weights_to_use = &_converted_weights;
- _are_weights_converted = false;
+ weights_to_use = &_converted_weights;
+ _needs_weights_conversion = true;
+ _trans_weights_idx = AuxTensorIdx::ConvertedWeights;
}
if(_is_fc_after_conv)
@@ -291,7 +297,11 @@ void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *wei
configure_fc_fc(src, weights_to_use, biases, dst, fc_info.activation_info);
}
- _are_weights_reshaped = _are_weights_reshaped || fc_info.retain_internal_weights;
+ // Retain the tensorinfo with the weights to use
+ if(_needs_weights_reshape || _needs_weights_conversion)
+ {
+ _trans_weights = *weights_to_use;
+ }
// Set auxiliary memory requirements
auto gemm_mem_req = (_is_quantized_asymmetric) ? _mm_gemmlowp->workspace() : _mm_gemm->workspace();
@@ -308,7 +318,7 @@ void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *wei
}
else
{
- _aux_mem[TransposedWeights] = MemoryInfo(offset_int_vec(TransposedWeights), MemoryLifetime::Persistent, _reshaped_weights.total_size());
+ _aux_mem[TransposedWeights] = MemoryInfo(offset_int_vec(TransposedWeights), _needs_weights_conversion ? MemoryLifetime::Prepare : MemoryLifetime::Persistent, _reshaped_weights.total_size());
_aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights), MemoryLifetime::Persistent, _converted_weights.total_size());
}
_aux_mem[FlattenedSrc] = MemoryInfo(offset_int_vec(FlattenedSrc), MemoryLifetime::Temporary, _flattened_src.total_size());
@@ -401,6 +411,7 @@ void CpuFullyConnected::run(ITensorPack &tensors)
auto src = tensors.get_const_tensor(ACL_SRC_0);
CpuAuxTensorHandler flattened_src(offset_int_vec(FlattenedSrc), _flattened_src, tensors, false);
+ CpuAuxTensorHandler transformed_wei(offset_int_vec(_trans_weights_idx), _trans_weights, tensors, false);
// Linearize src if it comes from a convolutional layer
if(_is_fc_after_conv)
@@ -411,6 +422,10 @@ void CpuFullyConnected::run(ITensorPack &tensors)
ITensorPack gemm_pack = tensors;
gemm_pack.add_const_tensor(ACL_SRC_0, (_is_fc_after_conv) ? flattened_src.get() : src);
+ if(_needs_weights_reshape || _needs_weights_conversion)
+ {
+ gemm_pack.add_const_tensor(ACL_SRC_1, transformed_wei.get());
+ }
// Run matrix multiply
if(_is_quantized_asymmetric)
@@ -436,7 +451,7 @@ void CpuFullyConnected::prepare(ITensorPack &tensors)
const ITensor *cur_weights = weights;
// Reshape of the weights (happens only once)
- if(!_are_weights_reshaped)
+ if(_needs_weights_reshape)
{
// Run reshape weights kernel and mark weights as unused
ITensorPack transpose_pack{ { ACL_SRC, weights }, { ACL_DST, reshaped_weights.get() } };
@@ -444,32 +459,29 @@ void CpuFullyConnected::prepare(ITensorPack &tensors)
cur_weights->mark_as_unused();
cur_weights = reshaped_weights.get();
-
- _are_weights_reshaped = true;
}
// Convert weights if needed (happens only once)
- if(!_are_weights_converted)
+ if(_needs_weights_conversion)
{
ITensorPack convert_pack{ { ACL_SRC, cur_weights }, { ACL_DST, converted_weights.get() } };
_convert_weights->run(convert_pack);
cur_weights->mark_as_unused();
cur_weights = converted_weights.get();
-
- _are_weights_converted = true;
}
- tensors.add_const_tensor(ACL_SRC_1, cur_weights);
+ ITensorPack gemm_pack = tensors;
+ gemm_pack.add_const_tensor(ACL_SRC_1, cur_weights);
// Prepare GEMM prepare and release unused weights
if(!_is_quantized_asymmetric)
{
- _mm_gemm->prepare(tensors);
+ _mm_gemm->prepare(gemm_pack);
}
else
{
- _mm_gemmlowp->prepare(tensors);
+ _mm_gemmlowp->prepare(gemm_pack);
}
_is_prepared = true;
diff --git a/src/runtime/cpu/operators/CpuFullyConnected.h b/src/runtime/cpu/operators/CpuFullyConnected.h
index 954a7b7ffc..498ceae68d 100644
--- a/src/runtime/cpu/operators/CpuFullyConnected.h
+++ b/src/runtime/cpu/operators/CpuFullyConnected.h
@@ -128,14 +128,16 @@ private:
std::unique_ptr<CpuGemm> _mm_gemm;
std::unique_ptr<CpuGemmLowpMatrixMultiplyCore> _mm_gemmlowp;
- TensorInfo _flattened_src;
- TensorInfo _converted_weights;
- TensorInfo _reshaped_weights;
+ TensorInfo _flattened_src;
+ TensorInfo _converted_weights;
+ TensorInfo _reshaped_weights;
+ TensorInfo _trans_weights;
+ AuxTensorIdx _trans_weights_idx;
experimental::MemoryRequirements _aux_mem;
- bool _are_weights_converted;
- bool _are_weights_reshaped;
+ bool _needs_weights_conversion;
+ bool _needs_weights_reshape;
bool _is_fc_after_conv;
bool _is_quantized_asymmetric;
bool _is_prepared;
diff --git a/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp b/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp
index 8adf7047fd..f22446863c 100644
--- a/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp
@@ -672,15 +672,6 @@ void CpuGemmLowpMatrixMultiplyCore::prepare(ITensorPack &tensors)
if(_asm_glue->is_configured())
{
_asm_glue->prepare(tensors);
-
- auto has_reshape = std::find_if(_aux_mem.begin(),
- _aux_mem.end(),
- [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; });
-
- if(has_reshape != std::end(_aux_mem))
- {
- original_b->mark_as_unused();
- }
}
// Run non-assembly reshape
else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue->is_configured())
diff --git a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
index bbbd5ac458..9786161dee 100644
--- a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
+++ b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
@@ -424,6 +424,8 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::prepare(ITensorPack &tensors)
CpuAuxTensorHandler pretranspose(offset_int_vec(Pretranspose), _pretranspose_info, tensors, false);
ARM_COMPUTE_ERROR_ON(pretranspose.get()->buffer() == nullptr);
_gemm_kernel_asm->pretranspose_B_array(pretranspose.get()->buffer(), in1_ptr, ldb, multi_stride_b);
+
+ b->mark_as_unused();
}
if(_gemm_info.method == AsmConvMethod::Indirect)