diff options
author | Sang-Hoon Park <sang-hoon.park@arm.com> | 2021-05-18 10:46:00 +0100 |
---|---|---|
committer | Pablo Marquez Tello <pablo.tello@arm.com> | 2021-05-27 16:33:44 +0000 |
commit | b3be45759bdd0749ae3a16fe470820f0d9830ea9 (patch) | |
tree | 10bb8c1c0a049a23c00781c64e993f1b197c0d05 /src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp | |
parent | bc91297c865808ed2c321febc405179f63195ff8 (diff) | |
download | ComputeLibrary-b3be45759bdd0749ae3a16fe470820f0d9830ea9.tar.gz |
Implement memory injection in CpuDirectGemmConv2d
The following operators are now stateless by implementing
memory injection.
- CpuDirectGemmConv2d
- CpuGemmAssemblyDispatch
A test case is added to verify that CpuDirectGemmConv2d can
run on different groups of tensors with a single configure call.
Resolves: COMPMID-4506
Change-Id: I48f44ed41236ca7e18da2de07bdbacc9007a3c5e
Signed-off-by: Sang-Hoon Park <sang-hoon.park@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5718
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Pablo Marquez Tello <pablo.tello@arm.com>
Diffstat (limited to 'src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp')
-rw-r--r-- | src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp | 100 |
1 files changed, 62 insertions, 38 deletions
diff --git a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp index 0c511ff548..53d71a3b80 100644 --- a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp +++ b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp @@ -240,7 +240,7 @@ public: */ void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info, - MemoryGroup &memory_group, IWeightsManager *weights_manager, const OutputStage &os = {}); + IWeightsManager *weights_manager, const OutputStage &os = {}); /** Set requantization shifts to be used * @@ -265,13 +265,42 @@ public: bool is_configured() const override; private: - /** Allocate a workspace tensor. + static constexpr size_t _workspace_alignment{ 4096 }; + /** Function to get the memory requirements */ + experimental::MemoryRequirements get_workspace() const override + { + experimental::MemoryRequirements req{}; + const auto size = _gemm_kernel_asm->get_working_size(); + if(size > 0) + { + req.emplace_back(TensorType::ACL_INT, size, _workspace_alignment); + } + return req; + } + + /** Function to import workspace tensors * - * @param[in] workspace_size Size to allocate. - * @param[in] memory_group Tensor memory group. - * @param[in] alignment Workspace memory alignment. 
+ * @param[in] tensors Tensor pack includes workspace tensors */ - void allocate_workspace(size_t workspace_size, MemoryGroup &memory_group, size_t alignment); + void import_workspace(ITensorPack &tensors) + { + const auto size = _gemm_kernel_asm->get_working_size(); + + if(size > 0) + { + auto imported_tensor = tensors.get_tensor(TensorType::ACL_INT); + ARM_COMPUTE_ERROR_ON_NULLPTR(imported_tensor); + const size_t workspace_size = _gemm_kernel_asm->get_working_size(); + _workspace.allocator()->init(TensorInfo(TensorShape{ (workspace_size + _workspace_alignment) }, 1, DataType::S8), _workspace_alignment); + _workspace.allocator()->import_memory(imported_tensor->buffer()); + } + } + /** Function free used workspace tensors */ + void free_imported_workspace() + { + _workspace.allocator()->free(); + } + /** Configure the indirect buffer * * @param[in] a Input tensor containing the Matrix A. @@ -334,6 +363,9 @@ private: }; template <typename TypeInput, typename TypeOutput, class OutputStage> +constexpr size_t Fallback<TypeInput, TypeOutput, OutputStage>::_workspace_alignment; + +template <typename TypeInput, typename TypeOutput, class OutputStage> std::tuple<bool, const int32_t *, const int32_t *, const int32_t *> Fallback<TypeInput, TypeOutput, OutputStage>::set_requantize_data(const std::vector<int32_t> &shifts, const std::vector<int32_t> &multipliers) { @@ -470,7 +502,7 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure_indirect(const ITen template <typename TypeInput, typename TypeOutput, class OutputStage> void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info, - MemoryGroup &memory_group, IWeightsManager *weights_manager, const OutputStage &os) + IWeightsManager *weights_manager, const OutputStage &os) { ARM_COMPUTE_UNUSED(c); arm_gemm::GemmConfig gemm_cfg; @@ -492,13 +524,6 @@ void Fallback<TypeInput, 
TypeOutput, OutputStage>::configure(const ITensorInfo * auto acl_gemm_wrapper = std::make_unique<kernel::CpuGemmAssemblyWrapperKernel<TypeInput, TypeOutput>>(); ARM_COMPUTE_ERROR_ON(acl_gemm_wrapper == nullptr); acl_gemm_wrapper->configure(_gemm_kernel_asm.get(), gemm_cfg.filter); - const size_t workspace_size = _gemm_kernel_asm->get_working_size(); - if(workspace_size > 0) - { - // Allocate workspace - const unsigned int alignment = 4096; - allocate_workspace(workspace_size, memory_group, alignment); - } //if we disable this code below in brackets then ConvLayer deadlocks when threads > 1 and //the shapes are In=1x1x1024 Weights=1x1x1024x1001 Biases=1001 Out=1x1x1001 @@ -587,15 +612,6 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::prepare(ITensorPack &tensors) } template <typename TypeInput, typename TypeOutput, class OutputStage> -void Fallback<TypeInput, TypeOutput, OutputStage>::allocate_workspace(size_t workspace_size, MemoryGroup &memory_group, size_t alignment) -{ - ARM_COMPUTE_ERROR_ON_MSG(workspace_size == 0, "size cannot be 0"); - _workspace.allocator()->init(TensorInfo(TensorShape{ (workspace_size + alignment) }, 1, DataType::S8), alignment); - memory_group.manage(&_workspace); - _workspace.allocator()->allocate(); -} - -template <typename TypeInput, typename TypeOutput, class OutputStage> bool Fallback<TypeInput, TypeOutput, OutputStage>::is_configured() const { return _optimised_kernel != nullptr; @@ -609,6 +625,10 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors) auto c = tensors.get_const_tensor(TensorType::ACL_SRC_2); auto d = tensors.get_tensor(TensorType::ACL_DST); + ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d); + + import_workspace(tensors); + int lda = a->info()->strides_in_bytes().y() / sizeof(TypeInput); int ldb = 0; const int ldd = d->info()->strides_in_bytes().y() / sizeof(TypeOutput); @@ -684,10 +704,11 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors) bias, 0); // Schedule 
NEScheduler::get().schedule(_optimised_kernel.get(), scheduling_hint); + free_imported_workspace(); } template <typename TypeInput, typename TypeOutput> -void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm, MemoryGroup &memory_group, +void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, arm_gemm::Activation activation, const AsmGemmInfo &info, IWeightsManager *weights_manager) { @@ -699,12 +720,12 @@ void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_ge // Create arm_gemm fallback auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput>>(); - fallback->configure(a, b, c, d, args, info, memory_group, weights_manager); + fallback->configure(a, b, c, d, args, info, weights_manager); arm_gemm = std::move(fallback); } template <typename TypeInput, typename TypeOutput> -void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm, MemoryGroup &memory_group, +void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, arm_gemm::Activation activation, const AsmGemmInfo &info, IWeightsManager *weights_manager) { @@ -744,14 +765,14 @@ void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> & } // Configure fallback - fallback->configure(a, b, c, d, args, info, memory_group, weights_manager, gemm_requant_info); + fallback->configure(a, b, c, d, args, info, weights_manager, gemm_requant_info); arm_gemm = std::move(fallback); } } //namespace -CpuGemmAssemblyDispatch::CpuGemmAssemblyDispatch(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager) - : _arm_gemm(nullptr), _memory_group(std::move(memory_manager)), _weights_manager(weights_manager) +CpuGemmAssemblyDispatch::CpuGemmAssemblyDispatch(IWeightsManager 
*weights_manager) + : _arm_gemm(nullptr), _weights_manager(weights_manager) { } @@ -806,40 +827,40 @@ void CpuGemmAssemblyDispatch::configure(const ITensorInfo *a, const ITensorInfo switch(a->data_type()) { case DataType::F32: - create_arm_gemm<float, float>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager); + create_arm_gemm<float, float>(_arm_gemm, a, b, c, d, act, info, _weights_manager); break; #ifdef __aarch64__ case DataType::U8: case DataType::QASYMM8: if(d->data_type() == DataType::S32) { - create_arm_gemm<uint8_t, uint32_t>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager); + create_arm_gemm<uint8_t, uint32_t>(_arm_gemm, a, b, c, d, act, info, _weights_manager); } else { - create_arm_gemm_quant<uint8_t, uint8_t>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager); + create_arm_gemm_quant<uint8_t, uint8_t>(_arm_gemm, a, b, c, d, act, info, _weights_manager); } break; case DataType::S8: case DataType::QASYMM8_SIGNED: if(d->data_type() == DataType::S32) { - create_arm_gemm<int8_t, int32_t>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager); + create_arm_gemm<int8_t, int32_t>(_arm_gemm, a, b, c, d, act, info, _weights_manager); } else { - create_arm_gemm_quant<int8_t, int8_t>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager); + create_arm_gemm_quant<int8_t, int8_t>(_arm_gemm, a, b, c, d, act, info, _weights_manager); } break; #endif /* __aarch64__ */ #if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) case DataType::BFLOAT16: - create_arm_gemm<bfloat16, float>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager); + create_arm_gemm<bfloat16, float>(_arm_gemm, a, b, c, d, act, info, _weights_manager); break; #endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */ #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: - create_arm_gemm<float16_t, float16_t>(_arm_gemm, _memory_group, a, b, 
c, d, act, info, _weights_manager); + create_arm_gemm<float16_t, float16_t>(_arm_gemm, a, b, c, d, act, info, _weights_manager); break; #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ default: @@ -860,10 +881,13 @@ bool CpuGemmAssemblyDispatch::is_configured() const void CpuGemmAssemblyDispatch::run(ITensorPack &tensors) { - MemoryGroupResourceScope scope_mg(_memory_group); - ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr); _arm_gemm->run(tensors); } + +experimental::MemoryRequirements CpuGemmAssemblyDispatch::workspace() const +{ + return is_configured() ? _arm_gemm->get_workspace() : experimental::MemoryRequirements{}; +} } // namespace cpu } // namespace arm_compute |