5 files changed, 45 insertions(+), 53 deletions(-)
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index d815a73b93..504200e9ce 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -44,7 +44,6 @@ struct NEFullyConnectedLayer::Impl
     const ITensor *original_weights{ nullptr };
 
     ITensorPack                      run_pack{};
-    ITensorPack                      prep_pack{};
     WorkspaceData<Tensor>            workspace{};
     experimental::MemoryRequirements aux_mem_req{};
 
@@ -79,8 +78,7 @@ void NEFullyConnectedLayer::configure(const ITensor *input, const ITensor *weigh
 
     _impl->aux_mem_req = _impl->op->workspace();
     _impl->run_pack    = { { ACL_SRC_0, input }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, output } };
-    _impl->prep_pack   = { { ACL_SRC_1, weights }, { ACL_SRC_2, biases } };
-    _impl->workspace   = manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
+    _impl->workspace   = manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack);
 }
 
 Status NEFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
@@ -101,20 +99,7 @@ void NEFullyConnectedLayer::prepare()
 {
     if(!_impl->is_prepared)
     {
-        _impl->op->prepare(_impl->prep_pack);
-
-        auto has_reshape = std::find_if(_impl->aux_mem_req.begin(),
-                                        _impl->aux_mem_req.end(),
-                                        [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; });
-
-        if(has_reshape != std::end(_impl->aux_mem_req))
-        {
-            _impl->original_weights->mark_as_unused();
-        }
-        else
-        {
-            _impl->run_pack.add_const_tensor(ACL_SRC_1, _impl->original_weights);
-        }
+        _impl->op->prepare(_impl->run_pack);
 
         // Release temporary tensors that are only used in prepare stage
         release_temporaries<Tensor>(_impl->aux_mem_req, _impl->workspace);
diff --git a/src/runtime/cpu/operators/CpuFullyConnected.cpp b/src/runtime/cpu/operators/CpuFullyConnected.cpp
index e7808fbc82..eeabce0753 100644
--- a/src/runtime/cpu/operators/CpuFullyConnected.cpp
+++ b/src/runtime/cpu/operators/CpuFullyConnected.cpp
@@ -150,9 +150,11 @@ CpuFullyConnected::CpuFullyConnected()
       _flattened_src(),
       _converted_weights(),
       _reshaped_weights(),
+      _trans_weights(),
+      _trans_weights_idx(AuxTensorIdx::Count),
       _aux_mem(Count),
-      _are_weights_converted(false),
-      _are_weights_reshaped(false),
+      _needs_weights_conversion(false),
+      _needs_weights_reshape(false),
       _is_fc_after_conv(false),
       _is_quantized_asymmetric(false),
       _is_prepared(false)
@@ -230,11 +232,13 @@ void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *wei
                                                   dst,
                                                   fc_info));
 
-    _are_weights_converted   = true;
-    _are_weights_reshaped    = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
-    _is_fc_after_conv        = true;
-    _is_quantized_asymmetric = is_data_type_quantized_asymmetric(src->data_type());
-    _is_prepared             = false;
+    _needs_weights_conversion = false;
+    _needs_weights_reshape    = fc_info.transpose_weights ? !fc_info.are_weights_reshaped : false;
+    _needs_weights_reshape    = _needs_weights_reshape && !fc_info.retain_internal_weights;
+    _is_fc_after_conv         = true;
+    _is_quantized_asymmetric  = is_data_type_quantized_asymmetric(src->data_type());
+    _is_prepared              = false;
+    _trans_weights_idx        = AuxTensorIdx::Count;
 
     // With the Fully Connected layer we can have 4 different cases:
     //  1) Convolution layer -> Fully Connected layer without batches
@@ -258,12 +262,13 @@ void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *wei
     }
 
     // Reshape weights if needed
-    if(!_are_weights_reshaped)
+    if(_needs_weights_reshape)
     {
         // Reshape the weights
         _transpose_weights = std::make_unique<kernels::CpuTransposeKernel>();
         _transpose_weights->configure(weights, &_reshaped_weights);
-        weights_to_use = &_reshaped_weights;
+        weights_to_use     = &_reshaped_weights;
+        _trans_weights_idx = AuxTensorIdx::TransposedWeights;
     }
 
     // Convert weights if needed
@@ -276,8 +281,9 @@ void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *wei
                                      src->tensor_shape(),
                                      fc_info.weights_trained_layout);
 
-        weights_to_use         = &_converted_weights;
-        _are_weights_converted = false;
+        weights_to_use            = &_converted_weights;
+        _needs_weights_conversion = true;
+        _trans_weights_idx        = AuxTensorIdx::ConvertedWeights;
     }
 
     if(_is_fc_after_conv)
@@ -291,7 +297,11 @@ void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *wei
         configure_fc_fc(src, weights_to_use, biases, dst, fc_info.activation_info);
     }
 
-    _are_weights_reshaped = _are_weights_reshaped || fc_info.retain_internal_weights;
+    // Retain the tensorinfo with the weights to use
+    if(_needs_weights_reshape || _needs_weights_conversion)
+    {
+        _trans_weights = *weights_to_use;
+    }
 
     // Set auxiliary memory requirements
     auto gemm_mem_req = (_is_quantized_asymmetric) ? _mm_gemmlowp->workspace() : _mm_gemm->workspace();
@@ -308,7 +318,7 @@ void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *wei
     }
     else
     {
-        _aux_mem[TransposedWeights] = MemoryInfo(offset_int_vec(TransposedWeights), MemoryLifetime::Persistent, _reshaped_weights.total_size());
+        _aux_mem[TransposedWeights] = MemoryInfo(offset_int_vec(TransposedWeights), _needs_weights_conversion ? MemoryLifetime::Prepare : MemoryLifetime::Persistent, _reshaped_weights.total_size());
         _aux_mem[ConvertedWeights]  = MemoryInfo(offset_int_vec(ConvertedWeights), MemoryLifetime::Persistent, _converted_weights.total_size());
     }
     _aux_mem[FlattenedSrc] = MemoryInfo(offset_int_vec(FlattenedSrc), MemoryLifetime::Temporary, _flattened_src.total_size());
@@ -401,6 +411,7 @@ void CpuFullyConnected::run(ITensorPack &tensors)
     auto src = tensors.get_const_tensor(ACL_SRC_0);
 
     CpuAuxTensorHandler flattened_src(offset_int_vec(FlattenedSrc), _flattened_src, tensors, false);
+    CpuAuxTensorHandler transformed_wei(offset_int_vec(_trans_weights_idx), _trans_weights, tensors, false);
 
     // Linearize src if it comes from a convolutional layer
     if(_is_fc_after_conv)
@@ -411,6 +422,10 @@ void CpuFullyConnected::run(ITensorPack &tensors)
 
     ITensorPack gemm_pack = tensors;
     gemm_pack.add_const_tensor(ACL_SRC_0, (_is_fc_after_conv) ? flattened_src.get() : src);
+    if(_needs_weights_reshape || _needs_weights_conversion)
+    {
+        gemm_pack.add_const_tensor(ACL_SRC_1, transformed_wei.get());
+    }
 
     // Run matrix multiply
     if(_is_quantized_asymmetric)
@@ -436,7 +451,7 @@ void CpuFullyConnected::prepare(ITensorPack &tensors)
         const ITensor *cur_weights = weights;
 
         // Reshape of the weights (happens only once)
-        if(!_are_weights_reshaped)
+        if(_needs_weights_reshape)
         {
             // Run reshape weights kernel and mark weights as unused
             ITensorPack transpose_pack{ { ACL_SRC, weights }, { ACL_DST, reshaped_weights.get() } };
@@ -444,32 +459,29 @@ void CpuFullyConnected::prepare(ITensorPack &tensors)
 
             cur_weights->mark_as_unused();
             cur_weights = reshaped_weights.get();
-
-            _are_weights_reshaped = true;
         }
 
         // Convert weights if needed (happens only once)
-        if(!_are_weights_converted)
+        if(_needs_weights_conversion)
        {
             ITensorPack convert_pack{ { ACL_SRC, cur_weights }, { ACL_DST, converted_weights.get() } };
             _convert_weights->run(convert_pack);
 
             cur_weights->mark_as_unused();
             cur_weights = converted_weights.get();
-
-            _are_weights_converted = true;
        }
 
-        tensors.add_const_tensor(ACL_SRC_1, cur_weights);
+        ITensorPack gemm_pack = tensors;
+        gemm_pack.add_const_tensor(ACL_SRC_1, cur_weights);
 
         // Prepare GEMM prepare and release unused weights
         if(!_is_quantized_asymmetric)
         {
-            _mm_gemm->prepare(tensors);
+            _mm_gemm->prepare(gemm_pack);
         }
         else
         {
-            _mm_gemmlowp->prepare(tensors);
+            _mm_gemmlowp->prepare(gemm_pack);
         }
 
         _is_prepared = true;
diff --git a/src/runtime/cpu/operators/CpuFullyConnected.h b/src/runtime/cpu/operators/CpuFullyConnected.h
index 954a7b7ffc..498ceae68d 100644
--- a/src/runtime/cpu/operators/CpuFullyConnected.h
+++ b/src/runtime/cpu/operators/CpuFullyConnected.h
@@ -128,14 +128,16 @@ private:
     std::unique_ptr<CpuGemm>                       _mm_gemm;
     std::unique_ptr<CpuGemmLowpMatrixMultiplyCore> _mm_gemmlowp;
 
-    TensorInfo _flattened_src;
-    TensorInfo _converted_weights;
-    TensorInfo _reshaped_weights;
+    TensorInfo   _flattened_src;
+    TensorInfo   _converted_weights;
+    TensorInfo   _reshaped_weights;
+    TensorInfo   _trans_weights;
+    AuxTensorIdx _trans_weights_idx;
 
     experimental::MemoryRequirements _aux_mem;
 
-    bool _are_weights_converted;
-    bool _are_weights_reshaped;
+    bool _needs_weights_conversion;
+    bool _needs_weights_reshape;
     bool _is_fc_after_conv;
     bool _is_quantized_asymmetric;
     bool _is_prepared;
diff --git a/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp b/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp
index 8adf7047fd..f22446863c 100644
--- a/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp
@@ -672,15 +672,6 @@ void CpuGemmLowpMatrixMultiplyCore::prepare(ITensorPack &tensors)
         if(_asm_glue->is_configured())
         {
             _asm_glue->prepare(tensors);
-
-            auto has_reshape = std::find_if(_aux_mem.begin(),
-                                            _aux_mem.end(),
-                                            [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; });
-
-            if(has_reshape != std::end(_aux_mem))
-            {
-                original_b->mark_as_unused();
-            }
         }
         // Run non-assembly reshape
         else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue->is_configured())
diff --git a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
index bbbd5ac458..9786161dee 100644
--- a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
+++ b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
@@ -424,6 +424,8 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::prepare(ITensorPack &tensors)
             CpuAuxTensorHandler pretranspose(offset_int_vec(Pretranspose), _pretranspose_info, tensors, false);
             ARM_COMPUTE_ERROR_ON(pretranspose.get()->buffer() == nullptr);
             _gemm_kernel_asm->pretranspose_B_array(pretranspose.get()->buffer(), in1_ptr, ldb, multi_stride_b);
+
+            b->mark_as_unused();
         }
 
         if(_gemm_info.method == AsmConvMethod::Indirect)
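
Note on the pattern this patch moves to: prepare() now consumes the same tensor pack as run() (the separate prep_pack is gone), writes the transformed weights into an auxiliary tensor whose MemoryLifetime (Temporary, Prepare, or Persistent) decides whether it survives preparation, and marks the original weights unused once they have been consumed, while each call builds a local gemm_pack instead of mutating the caller's pack. The stand-alone C++ sketch below mirrors that prepare-once/run-many flow under simplified assumptions; Slot, Pack, Tensor, and FakeFullyConnected are illustrative stand-ins, not Compute Library API.

// Minimal sketch of the prepare-once/run-many flow -- all types here are
// simplified stand-ins, not Arm Compute Library API.
#include <cstdio>
#include <map>
#include <vector>

enum Slot { SRC_0, SRC_1, DST };

struct Tensor
{
    std::vector<float> data;
    bool               used{ true };
    void mark_as_unused() { used = false; } // caller may now free the buffer
};

using Pack = std::map<Slot, Tensor *>;

struct FakeFullyConnected
{
    Tensor transformed_weights; // "persistent" auxiliary tensor
    bool   prepared{ false };

    // prepare() consumes the same pack as run(): no separate prep_pack.
    void prepare(const Pack &pack)
    {
        if(prepared)
        {
            return;
        }
        Tensor *weights = pack.at(SRC_1);
        // Stand-in for the transpose/convert kernels in the patch.
        transformed_weights.data.assign(weights->data.rbegin(), weights->data.rend());
        weights->mark_as_unused(); // originals are no longer needed after this point
        prepared = true;
    }

    void run(Pack pack) // taken by value: a per-run "gemm_pack", caller's pack untouched
    {
        prepare(pack);
        // Inject the transformed weights, mirroring
        // gemm_pack.add_const_tensor(ACL_SRC_1, transformed_wei.get()).
        pack[SRC_1] = &transformed_weights;
        std::printf("run: %zu transformed weights\n", pack[SRC_1]->data.size());
    }
};

int main()
{
    Tensor src{ { 1.f, 2.f } }, weights{ { 3.f, 4.f } }, dst;
    Pack   pack{ { SRC_0, &src }, { SRC_1, &weights }, { DST, &dst } };

    FakeFullyConnected fc;
    fc.run(pack); // first run prepares; original weights marked unused
    fc.run(pack); // later runs reuse the persistent transformed weights
    std::printf("original weights still in use: %s\n", weights.used ? "yes" : "no");
}

The same reasoning explains the one behavioural subtlety above: when a conversion follows the reshape, the reshaped weights are only an intermediate, so the patch demotes TransposedWeights from MemoryLifetime::Persistent to MemoryLifetime::Prepare, letting that buffer be released once preparation finishes.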