From 8ae3cdadbc96910171d35abaab633be03b07d6f4 Mon Sep 17 00:00:00 2001
From: Michele Di Giorgio
Date: Mon, 7 Jun 2021 15:30:26 +0100
Subject: Revert "Implement memory injection in CpuDirectGemmConv2d"

This reverts commit b3be45759bdd0749ae3a16fe470820f0d9830ea9.

Resolves: COMPMID-4548

Change-Id: I46e0d8c67ddf988af3ce38f83177cda412db916c
Signed-off-by: Michele Di Giorgio
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5775
Tested-by: Arm Jenkins
Reviewed-by: Sheri Zhang
---
 src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp  |  58 +++--------
 src/runtime/cpu/operators/CpuGemmDirectConv2d.h    |  17 ++-
 .../operators/internal/CpuGemmAssemblyDispatch.cpp | 114 ++++++++----------
 .../operators/internal/CpuGemmAssemblyDispatch.h   |  18 ++--
 4 files changed, 74 insertions(+), 133 deletions(-)

(limited to 'src/runtime/cpu')

diff --git a/src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp b/src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp
index 7b7b68a93b..e50099df1f 100644
--- a/src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp
+++ b/src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp
@@ -53,13 +53,11 @@ GEMMLowpOutputStageInfo calculate_output_stage_metadata(const ITensorInfo *src,
         ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
         ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
     };
-
     PixelValue type_min{};
     PixelValue type_max{};
-
     std::tie(type_min, type_max) = get_min_max(data_type);
-    int32_t min_activation = type_min.get<int32_t>();
-    int32_t max_activation = type_max.get<int32_t>();
+    int32_t min_activation = type_min.get<int32_t>();
+    int32_t max_activation = type_max.get<int32_t>();
     if(supported_acts.count(act.activation()) != 0)
     {
         std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act, data_type, uoqinfo);
@@ -89,8 +87,8 @@ cpu::AsmGemmInfo init_assembly_metadata(const Conv2dInfo &info, bool is_indirect
     }
 }
 } // namespace
-CpuGemmDirectConv2d::CpuGemmDirectConv2d()
-    : _gemm_asm_func(std::make_unique<CpuGemmAssemblyDispatch>()),
+CpuGemmDirectConv2d::CpuGemmDirectConv2d(const std::shared_ptr<IMemoryManager> &memory_manager)
+    : _gemm_asm_func(std::make_unique<CpuGemmAssemblyDispatch>(memory_manager)),
       _activation_func(std::make_unique<CpuActivation>()),
       _weights_permute_func(std::make_unique<CpuPermute>()),
       _permuted_weights_info(),
@@ -165,8 +163,6 @@ Status CpuGemmDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *
 }
 void CpuGemmDirectConv2d::run(ITensorPack &tensors)
 {
-    import_workspace_memory(tensors);
-
     prepare(tensors);

     _gemm_asm_func->run(tensors);
@@ -174,14 +170,22 @@ void CpuGemmDirectConv2d::run(ITensorPack &tensors)
     {
         _activation_func->run(tensors);
     }
+}

-    free_imported_workspace_memory();
+void CpuGemmDirectConv2d::allocate_permuted_weights()
+{
+    // TODO: This function will be removed when memory injection is implemented.
+    ARM_COMPUTE_ERROR_ON(_permuted_weights == nullptr);
+    _permuted_weights->allocator()->free();
+    _permuted_weights->allocator()->init(_permuted_weights_info);
+    _permuted_weights->allocator()->allocate();
 }

 void CpuGemmDirectConv2d::prepare(ITensorPack &tensors)
 {
     if(!_is_prepared)
     {
+        allocate_permuted_weights();
         ITensorPack permute_tensors
         {
             { TensorType::ACL_SRC, tensors.get_const_tensor(TensorType::ACL_SRC_1) },
@@ -198,41 +202,5 @@ void CpuGemmDirectConv2d::prepare(ITensorPack &tensors)
     }
 }

-experimental::MemoryRequirements CpuGemmDirectConv2d::workspace() const
-{
-    experimental::MemoryRequirements req = _gemm_asm_func->workspace();
-
-    auto index = static_cast<std::underlying_type<TensorType>::type>(TensorType::ACL_INT_0);
-
-    if(req.size() > 0)
-    {
-        index = req.back().slot + 1;
-
-        constexpr auto max_index = static_cast<std::underlying_type<TensorType>::type>(TensorType::ACL_INT_4);
-        ARM_COMPUTE_UNUSED(max_index); // in order to prevent build error with assertion is disabled.
-        ARM_COMPUTE_ERROR_ON(index > max_index);
-    }
-
-    req.emplace_back(index, _permuted_weights_info.total_size(), 0);
-
-    return req;
-}
-
-void CpuGemmDirectConv2d::import_workspace_memory(ITensorPack &tensors)
-{
-    auto imported_tensor = tensors.get_tensor(workspace().back().slot);
-
-    ARM_COMPUTE_ERROR_ON_NULLPTR(imported_tensor);
-
-    auto imported_memory = imported_tensor->buffer();
-    _permuted_weights->allocator()->init(_permuted_weights_info);
-    _permuted_weights->allocator()->import_memory(imported_memory);
-}
-
-void CpuGemmDirectConv2d::free_imported_workspace_memory()
-{
-    _permuted_weights->allocator()->free();
-}
-
 } // namespace cpu
 } // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/cpu/operators/CpuGemmDirectConv2d.h b/src/runtime/cpu/operators/CpuGemmDirectConv2d.h
index 305a076908..6aa17c2349 100644
--- a/src/runtime/cpu/operators/CpuGemmDirectConv2d.h
+++ b/src/runtime/cpu/operators/CpuGemmDirectConv2d.h
@@ -48,7 +48,7 @@ class CpuGemmDirectConv2d : public ICpuOperator
 {
 public:
     /** Constructor */
-    CpuGemmDirectConv2d();
+    CpuGemmDirectConv2d(const std::shared_ptr<IMemoryManager> &memory_manager = nullptr);
     ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmDirectConv2d);
     /** Destructor */
     ~CpuGemmDirectConv2d();
@@ -80,16 +80,15 @@ public:
     void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &info);
     /** Static function to check if given info will lead to a valid configuration of @ref CpuGemmDirectConv2d
      *
-     * Similar to @ref CpuGemmDirectConv2d::configure()
+     * Similar to CpuGemmDirectConv2d::configure()
      *
      * @return a status
      */
     static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &info);

     // Inherited methods overridden:
-    void run(ITensorPack &tensors) override;
-    void prepare(ITensorPack &constants) override;
-    experimental::MemoryRequirements workspace() const override;
+    void run(ITensorPack &tensors) override;
+    void prepare(ITensorPack &constants) override;

 private:
     std::unique_ptr<CpuGemmAssemblyDispatch> _gemm_asm_func;
@@ -101,13 +100,11 @@ private:
     bool _is_prepared{ false };
     bool _run_activation{ false };

-    /** Function to import workspace tensors
+    /** Function to allocate a tensor for permuted weights
      *
-     * @param[in] tensors Tensor pack includes workspace tensors
+     * @note This function will be removed when memory injection is properly implemented.
      */
-    void import_workspace_memory(ITensorPack &tensors);
-    /** Function free used workspace tensors */
-    void free_imported_workspace_memory();
+    void allocate_permuted_weights();
 };
 } // namespace cpu
 } // namespace arm_compute
diff --git a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
index 53d71a3b80..ea3742fee5 100644
--- a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
+++ b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
@@ -204,11 +204,11 @@ public:
     }

 private:
-    Tensor _output{};
-    int _ldb{};
-    const TypeInput *_in1_ptr{};
-    int _multi_stride_b{};
-    size_t _B_pretranspose_size{};
+    Tensor           _output{};
+    int              _ldb{};
+    const TypeInput *_in1_ptr{};
+    int              _multi_stride_b{};
+    size_t           _B_pretranspose_size{};
     std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{ nullptr };
 };

@@ -240,7 +240,7 @@ public:
      */
     void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info,
-                   IWeightsManager *weights_manager, const OutputStage &os = {});
+                   MemoryGroup &memory_group, IWeightsManager *weights_manager, const OutputStage &os = {});

     /** Set requantization shifts to be used
      *
@@ -265,42 +265,13 @@ public:
     bool is_configured() const override;

 private:
-    static constexpr size_t _workspace_alignment{ 4096 };
-    /** Function to get the memory requirements */
-    experimental::MemoryRequirements get_workspace() const override
-    {
-        experimental::MemoryRequirements req{};
-        const auto size = _gemm_kernel_asm->get_working_size();
-        if(size > 0)
-        {
-            req.emplace_back(TensorType::ACL_INT, size, _workspace_alignment);
-        }
-        return req;
-    }
-
-    /** Function to import workspace tensors
-     *
-     * @param[in] tensors Tensor pack includes workspace tensors
-     */
-    void import_workspace(ITensorPack &tensors)
-    {
-        const auto size = _gemm_kernel_asm->get_working_size();
-
-        if(size > 0)
-        {
-            auto imported_tensor = tensors.get_tensor(TensorType::ACL_INT);
-            ARM_COMPUTE_ERROR_ON_NULLPTR(imported_tensor);
-            const size_t workspace_size = _gemm_kernel_asm->get_working_size();
-            _workspace.allocator()->init(TensorInfo(TensorShape{ (workspace_size + _workspace_alignment) }, 1, DataType::S8), _workspace_alignment);
-            _workspace.allocator()->import_memory(imported_tensor->buffer());
-        }
-    }
-    /** Function free used workspace tensors */
-    void free_imported_workspace()
-    {
-        _workspace.allocator()->free();
-    }
-
+    /** Allocate a workspace tensor.
+     *
+     * @param[in] workspace_size Size to allocate.
+     * @param[in] memory_group   Tensor memory group.
+     * @param[in] alignment      Workspace memory alignment.
+     */
+    void allocate_workspace(size_t workspace_size, MemoryGroup &memory_group, size_t alignment);
     /** Configure the indirect buffer
      *
      * @param[in] a Input tensor containing the Matrix A.
@@ -339,8 +310,8 @@ private:
     /** Indirect buffer */
     std::unique_ptr<const TypeInput *const *, free_delete> _indirect_arg{};
     std::unique_ptr<const TypeInput *, free_delete>        _indirect_buf{};
-    std::vector<TypeInput> _indirect_pad{};
-    arm_gemm::ConvolutionParameters _cp{};
+    std::vector<TypeInput>          _indirect_pad{};
+    arm_gemm::ConvolutionParameters _cp{};

     bool is_weight_managed()
     {
@@ -362,9 +333,6 @@ private:
     }
 };

-template <typename TypeInput, typename TypeOutput, class OutputStage>
-constexpr size_t Fallback<TypeInput, TypeOutput, OutputStage>::_workspace_alignment;
-
 template <typename TypeInput, typename TypeOutput, class OutputStage>
 std::tuple<bool, const int32_t *, const int32_t *, const int32_t *> Fallback<TypeInput, TypeOutput, OutputStage>::set_requantize_data(const std::vector<int32_t> &shifts,
                                                                                                                                      const std::vector<int32_t> &multipliers)
@@ -502,7 +470,7 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure_indirect(const ITen
 template <typename TypeInput, typename TypeOutput, class OutputStage>
 void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info,
-                                                             IWeightsManager *weights_manager, const OutputStage &os)
+                                                             MemoryGroup &memory_group, IWeightsManager *weights_manager, const OutputStage &os)
 {
     ARM_COMPUTE_UNUSED(c);
     arm_gemm::GemmConfig gemm_cfg;
@@ -524,6 +492,13 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo *
     auto acl_gemm_wrapper = std::make_unique<kernel::CpuGemmAssemblyWrapperKernel<TypeInput, TypeOutput>>();
     ARM_COMPUTE_ERROR_ON(acl_gemm_wrapper == nullptr);
     acl_gemm_wrapper->configure(_gemm_kernel_asm.get(), gemm_cfg.filter);
+    const size_t workspace_size = _gemm_kernel_asm->get_working_size();
+    if(workspace_size > 0)
+    {
+        // Allocate workspace
+        const unsigned int alignment = 4096;
+        allocate_workspace(workspace_size, memory_group, alignment);
+    }

     //if we disable this code below in brackets then ConvLayer deadlocks when threads > 1 and
     //the shapes are In=1x1x1024 Weights=1x1x1024x1001 Biases=1001 Out=1x1x1001
@@ -611,6 +586,15 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::prepare(ITensorPack &tensors)
     }
 }

+template <typename TypeInput, typename TypeOutput, class OutputStage>
+void Fallback<TypeInput, TypeOutput, OutputStage>::allocate_workspace(size_t workspace_size, MemoryGroup &memory_group, size_t alignment)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(workspace_size == 0, "size cannot be 0");
+    _workspace.allocator()->init(TensorInfo(TensorShape{ (workspace_size + alignment) }, 1, DataType::S8), alignment);
+    memory_group.manage(&_workspace);
+    _workspace.allocator()->allocate();
+}
+
 template <typename TypeInput, typename TypeOutput, class OutputStage>
 bool Fallback<TypeInput, TypeOutput, OutputStage>::is_configured() const
 {
@@ -625,10 +609,6 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors)
     auto c = tensors.get_const_tensor(TensorType::ACL_SRC_2);
     auto d = tensors.get_tensor(TensorType::ACL_DST);

-    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d);
-
-    import_workspace(tensors);
-
     int lda = a->info()->strides_in_bytes().y() / sizeof(TypeInput);
     int ldb = 0;
     const int ldd = d->info()->strides_in_bytes().y() / sizeof(TypeOutput);
@@ -704,11 +684,10 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors)
                                          bias, 0);
     // Schedule
     NEScheduler::get().schedule(_optimised_kernel.get(), scheduling_hint);
-    free_imported_workspace();
 }

 template <typename TypeInput, typename TypeOutput>
-void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm,
+void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm, MemoryGroup &memory_group,
                      const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, arm_gemm::Activation activation, const AsmGemmInfo &info,
                      IWeightsManager *weights_manager)
 {

     // Create arm_gemm fallback
     auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput>>();
-    fallback->configure(a, b, c, d, args, info, weights_manager);
+    fallback->configure(a, b, c, d, args, info, memory_group, weights_manager);
     arm_gemm = std::move(fallback);
 }

 template <typename TypeInput, typename TypeOutput>
-void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm,
+void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm, MemoryGroup &memory_group,
                            const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, arm_gemm::Activation activation, const AsmGemmInfo &info,
                            IWeightsManager *weights_manager)
 {
@@ -765,14 +744,14 @@ void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &
     }

     // Configure fallback
-    fallback->configure(a, b, c, d, args, info, weights_manager, gemm_requant_info);
+    fallback->configure(a, b, c, d, args, info, memory_group, weights_manager, gemm_requant_info);

     arm_gemm = std::move(fallback);
 }
 } //namespace

-CpuGemmAssemblyDispatch::CpuGemmAssemblyDispatch(IWeightsManager *weights_manager)
-    : _arm_gemm(nullptr), _weights_manager(weights_manager)
+CpuGemmAssemblyDispatch::CpuGemmAssemblyDispatch(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
+    : _arm_gemm(nullptr), _memory_group(std::move(memory_manager)), _weights_manager(weights_manager)
 {
 }

@@ -827,40 +806,40 @@ void CpuGemmAssemblyDispatch::configure(const ITensorInfo *a, const ITensorInfo
     switch(a->data_type())
     {
         case DataType::F32:
-            create_arm_gemm<float, float>(_arm_gemm, a, b, c, d, act, info, _weights_manager);
+            create_arm_gemm<float, float>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager);
             break;
 #ifdef __aarch64__
         case DataType::U8:
         case DataType::QASYMM8:
             if(d->data_type() == DataType::S32)
             {
-                create_arm_gemm<uint8_t, uint32_t>(_arm_gemm, a, b, c, d, act, info, _weights_manager);
+                create_arm_gemm<uint8_t, uint32_t>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager);
             }
             else
             {
-                create_arm_gemm_quant<uint8_t, uint8_t>(_arm_gemm, a, b, c, d, act, info, _weights_manager);
+                create_arm_gemm_quant<uint8_t, uint8_t>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager);
             }
             break;
         case DataType::S8:
         case DataType::QASYMM8_SIGNED:
             if(d->data_type() == DataType::S32)
             {
-                create_arm_gemm<int8_t, int32_t>(_arm_gemm, a, b, c, d, act, info, _weights_manager);
+                create_arm_gemm<int8_t, int32_t>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager);
             }
             else
             {
-                create_arm_gemm_quant<int8_t, int8_t>(_arm_gemm, a, b, c, d, act, info, _weights_manager);
+                create_arm_gemm_quant<int8_t, int8_t>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager);
             }
             break;
 #endif /* __aarch64__ */
 #if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)
         case DataType::BFLOAT16:
-            create_arm_gemm<bfloat16, float>(_arm_gemm, a, b, c, d, act, info, _weights_manager);
+            create_arm_gemm<bfloat16, float>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager);
             break;
 #endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         case DataType::F16:
-            create_arm_gemm<float16_t, float16_t>(_arm_gemm, a, b, c, d, act, info, _weights_manager);
+            create_arm_gemm<float16_t, float16_t>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager);
             break;
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
         default:
@@ -881,13 +860,10 @@ bool CpuGemmAssemblyDispatch::is_configured() const

 void CpuGemmAssemblyDispatch::run(ITensorPack &tensors)
 {
+    MemoryGroupResourceScope scope_mg(_memory_group);
+
     ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr);
     _arm_gemm->run(tensors);
 }
-
-experimental::MemoryRequirements CpuGemmAssemblyDispatch::workspace() const
-{
-    return is_configured() ? _arm_gemm->get_workspace() : experimental::MemoryRequirements{};
-}
 } // namespace cpu
 } // namespace arm_compute
diff --git a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h
index 154def6708..ffc097c75c 100644
--- a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h
+++ b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h
@@ -24,6 +24,7 @@
 #ifndef ARM_COMPUTE_CPU_INTERNAL_CPU_GEMM_ASSEMBLY_DISPATCH_H
 #define ARM_COMPUTE_CPU_INTERNAL_CPU_GEMM_ASSEMBLY_DISPATCH_H

+#include "arm_compute/runtime/IMemoryManager.h"
 #include "arm_compute/runtime/IWeightsManager.h"
 #include "arm_compute/runtime/MemoryGroup.h"
 #include "arm_compute/runtime/Tensor.h"
@@ -61,7 +62,7 @@ class CpuGemmAssemblyDispatch : public ICpuOperator
 {
 public:
     /** Constructor */
-    CpuGemmAssemblyDispatch(IWeightsManager *weights_manager = nullptr);
+    CpuGemmAssemblyDispatch(std::shared_ptr<IMemoryManager> memory_manager = nullptr, IWeightsManager *weights_manager = nullptr);
     /** Default destructor */
     ~CpuGemmAssemblyDispatch() = default;

@@ -70,11 +71,10 @@ public:
     class IFallback
     {
     public:
-        virtual void run(ITensorPack &tensors) = 0;
-        virtual void prepare(ITensorPack &tensors) = 0;
-        virtual bool is_configured() const = 0;
-        virtual ~IFallback() = default;
-        virtual experimental::MemoryRequirements get_workspace() const = 0;
+        virtual void run(ITensorPack &tensors)     = 0;
+        virtual void prepare(ITensorPack &tensors) = 0;
+        virtual bool is_configured() const         = 0;
+        virtual ~IFallback()                       = default;
     };

 public:
@@ -113,12 +113,12 @@ public:
     bool is_configured() const;

     // Inherited methods overridden:
-    void prepare(ITensorPack &tensors) override;
-    void run(ITensorPack &tensors) override;
-    experimental::MemoryRequirements workspace() const override;
+    void prepare(ITensorPack &tensors) override;
+    void run(ITensorPack &tensors) override;

 private:
     std::unique_ptr<IFallback> _arm_gemm;         /**< Interface for the arm_gemm fallback */
+    MemoryGroup                _memory_group;     /**< Function memory group */
     IWeightsManager           *_weights_manager;  /**< Pointer to the weights manager */
 };
 } // namespace cpu
-- cgit v1.2.1
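
For readers tracing what this revert restores: the assembly workspace goes back to being owned by the operator's MemoryGroup and optionally backed by an injected IMemoryManager, instead of being requested from the caller via workspace()/ITensorPack slots. The sketch below is a minimal, hypothetical operator (the class name and the configure()/run() shapes are illustrative, not code from this patch); only the memory-management calls mirror the diff above: TensorAllocator::init() with an alignment, MemoryGroup::manage(), TensorAllocator::allocate(), and a MemoryGroupResourceScope around run().

    // Minimal sketch of the manager-owned workspace pattern; assumes the
    // Arm Compute Library runtime headers used elsewhere in this patch.
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/IMemoryManager.h"
    #include "arm_compute/runtime/MemoryGroup.h"
    #include "arm_compute/runtime/Tensor.h"

    #include <memory>

    using namespace arm_compute;

    // Hypothetical operator, named here only for illustration.
    class ExampleAsmOperator
    {
    public:
        // A null memory manager is valid: the group then allocates
        // immediately, which is why the constructors default to nullptr.
        explicit ExampleAsmOperator(std::shared_ptr<IMemoryManager> memory_manager = nullptr)
            : _memory_group(std::move(memory_manager))
        {
        }

        void configure(size_t workspace_size, size_t alignment)
        {
            // Describe the scratch buffer, hand its lifetime to the memory
            // group, then allocate; a managed group may defer the allocation
            // and reuse the backing memory across operators.
            _workspace.allocator()->init(TensorInfo(TensorShape{ workspace_size + alignment }, 1, DataType::S8), alignment);
            _memory_group.manage(&_workspace);
            _workspace.allocator()->allocate();
        }

        void run()
        {
            // Acquire the group's backing memory for the duration of run(),
            // as CpuGemmAssemblyDispatch::run() does after this revert.
            MemoryGroupResourceScope scope_mg(_memory_group);
            // ... launch kernels that read/write _workspace.buffer() ...
        }

    private:
        MemoryGroup _memory_group;
        Tensor      _workspace{};
    };

The ordering matters: manage() must sit between init() and allocate() so the group can intercept the allocation, which is exactly the sequence allocate_workspace() follows in the reverted CpuGemmAssemblyDispatch.cpp above.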