diff options
author | Georgios Pinitas <georgios.pinitas@arm.com> | 2021-07-08 18:14:45 +0100 |
---|---|---|
committer | Georgios Pinitas <georgios.pinitas@arm.com> | 2021-07-16 14:39:47 +0000 |
commit | 2b147ee857eb237613670460c52efedd43601955 (patch) | |
tree | 2c2f66754dca6d83e4967daae600e84bca8ca6b4 /src | |
parent | d0c5df2695e6e30d600c0339f547373c0c6667b0 (diff) | |
download | ComputeLibrary-2b147ee857eb237613670460c52efedd43601955.tar.gz |
Avoid multiple Rhs matrix transformations on ClGemm
ClWinogradConv2d was performing the Rhs transformation on every step,
impacting performance.
Adds scope logging support through ARM_COMPUTE_LOG_MSG_WITH_FUNCNAME
Resolves: COMPMID-4596
Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Change-Id: Ib329d3bc8d8aa21abae9fabfe61de35cc84d4819
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5925
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src')
-rw-r--r-- | src/common/utils/Log.h | 14 | ||||
-rw-r--r-- | src/core/helpers/MemoryHelpers.h | 33 | ||||
-rw-r--r-- | src/graph/GraphManager.cpp | 7 | ||||
-rw-r--r-- | src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp | 11 | ||||
-rw-r--r-- | src/runtime/NEON/functions/NEGEMM.cpp | 4 | ||||
-rw-r--r-- | src/runtime/NEON/functions/NEGEMMConv2d.cpp | 4 | ||||
-rw-r--r-- | src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp | 4 | ||||
-rw-r--r-- | src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp | 4 | ||||
-rw-r--r-- | src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp | 4 | ||||
-rw-r--r-- | src/runtime/gpu/cl/operators/ClGemm.cpp | 4 | ||||
-rw-r--r-- | src/runtime/gpu/cl/operators/ClWinogradConv2d.cpp | 32 |
11 files changed, 91 insertions, 30 deletions
diff --git a/src/common/utils/Log.h b/src/common/utils/Log.h index 496ee74a16..cfbc95a627 100644 --- a/src/common/utils/Log.h +++ b/src/common/utils/Log.h @@ -43,6 +43,7 @@ #else /* ARM_COMPUTE_LOGGING_ENABLED */ #define ARM_COMPUTE_CREATE_ACL_LOGGER() #endif /* ARM_COMPUTE_LOGGING_ENABLED */ + /** Log a message to the logger * * @param[in] log_level Logging level @@ -54,6 +55,7 @@ ARM_COMPUTE_CREATE_ACL_LOGGER(); \ ARM_COMPUTE_LOG_MSG("ComputeLibrary", log_level, msg); \ } while(false) + /** Log a message with format to the logger * * @param[in] log_level Logging level @@ -66,6 +68,7 @@ ARM_COMPUTE_CREATE_ACL_LOGGER(); \ ARM_COMPUTE_LOG_MSG_WITH_FORMAT("ComputeLibrary", log_level, fmt, __VA_ARGS__); \ } while(false) + /** Log an error message to the logger * * @param[in] msg Message to log @@ -88,4 +91,15 @@ ARM_COMPUTE_LOG_MSG_WITH_FUNCNAME("ComputeLibrary", arm_compute::logging::LogLevel::ERROR, msg); \ } while(false) +/** Log an information message to the logger with function name before the message + * + * @param[in] msg Message to log + */ +#define ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL(msg) \ + do \ + { \ + ARM_COMPUTE_CREATE_ACL_LOGGER(); \ + ARM_COMPUTE_LOG_MSG_WITH_FUNCNAME("ComputeLibrary", arm_compute::logging::LogLevel::INFO, msg); \ + } while(false) + #endif /* SRC_COMMON_LOG_H */ diff --git a/src/core/helpers/MemoryHelpers.h b/src/core/helpers/MemoryHelpers.h index 619a4ec122..60a2dbfff7 100644 --- a/src/core/helpers/MemoryHelpers.h +++ b/src/core/helpers/MemoryHelpers.h @@ -41,7 +41,15 @@ inline int offset_int_vec(int offset) } template <typename TensorType> -using WorkspaceData = std::vector<std::pair<int, std::unique_ptr<TensorType>>>; +struct WorkspaceDataElement +{ + int slot{ -1 }; + experimental::MemoryLifetime lifetime{ experimental::MemoryLifetime::Temporary }; + std::unique_ptr<TensorType> tensor{ nullptr }; +}; + +template <typename TensorType> +using WorkspaceData = std::vector<WorkspaceDataElement<TensorType>>; template <typename 
TensorType> WorkspaceData<TensorType> manage_workspace(const experimental::MemoryRequirements &mem_reqs, @@ -66,9 +74,9 @@ WorkspaceData<TensorType> manage_workspace(const experimental::MemoryRequirement } const auto aux_info = TensorInfo{ TensorShape(req.size), 1, DataType::U8 }; - workspace_memory.emplace_back(req.slot, std::make_unique<TensorType>()); + workspace_memory.emplace_back(WorkspaceDataElement<TensorType> { req.slot, req.lifetime, std::make_unique<TensorType>() }); - auto aux_tensor = workspace_memory.back().second.get(); + auto aux_tensor = workspace_memory.back().tensor.get(); ARM_COMPUTE_ERROR_ON_NULLPTR(aux_tensor); aux_tensor->allocator()->init(aux_info, req.alignment); @@ -85,11 +93,28 @@ WorkspaceData<TensorType> manage_workspace(const experimental::MemoryRequirement for(auto &mem : workspace_memory) { - auto tensor = mem.second.get(); + auto tensor = mem.tensor.get(); tensor->allocator()->allocate(); } return workspace_memory; } + +template <typename TensorType> +void release_prepare_tensors(WorkspaceData<TensorType> &workspace, ITensorPack &prep_pack) +{ + workspace.erase(std::remove_if(workspace.begin(), + workspace.end(), + [&prep_pack](auto & wk) + { + const bool to_erase = wk.lifetime == experimental::MemoryLifetime::Prepare; + if(to_erase) + { + prep_pack.remove_tensor(wk.slot); + } + return to_erase; + }), + workspace.end()); +} } // namespace arm_compute #endif /* SRC_COMMON_MEMORY_HELPERS_H */ diff --git a/src/graph/GraphManager.cpp b/src/graph/GraphManager.cpp index e357f10401..45b608c70a 100644 --- a/src/graph/GraphManager.cpp +++ b/src/graph/GraphManager.cpp @@ -29,10 +29,11 @@ #include "arm_compute/graph/PassManager.h" #include "arm_compute/graph/TypePrinter.h" #include "arm_compute/graph/Utils.h" +#include "arm_compute/graph/algorithms/TopologicalSort.h" #include "arm_compute/graph/detail/CrossLayerMemoryManagerHelpers.h" #include "arm_compute/graph/detail/ExecutionHelpers.h" -#include 
"arm_compute/graph/algorithms/TopologicalSort.h" +#include "src/common/utils/Log.h" namespace arm_compute { @@ -45,6 +46,8 @@ GraphManager::GraphManager() void GraphManager::finalize_graph(Graph &graph, GraphContext &ctx, PassManager &pm, Target target) { + ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Initiate graph configuration!"); + // Check if graph has been registered if(_workloads.find(graph.id()) != std::end(_workloads)) { @@ -121,6 +124,8 @@ void GraphManager::finalize_graph(Graph &graph, GraphContext &ctx, PassManager & void GraphManager::execute_graph(Graph &graph) { + ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Initiate graph execution!"); + // Check if graph is finalized auto it = _workloads.find(graph.id()); ARM_COMPUTE_ERROR_ON_MSG(it == std::end(_workloads), "Graph is not registered!"); diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp index f758c3d0b3..fa01c914c5 100644 --- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp @@ -41,7 +41,6 @@ struct CLWinogradConvolutionLayer::Impl ICLTensor *dst{ nullptr }; std::unique_ptr<opencl::ClWinogradConv2d> op{ nullptr }; ITensorPack run_pack{}; - ITensorPack prep_pack{}; MemoryGroup memory_group{}; WorkspaceData<CLTensor> workspace_tensors{}; bool is_prepared{ false }; @@ -80,9 +79,7 @@ void CLWinogradConvolutionLayer::configure(const CLCompileContext &compile_conte { TensorType::ACL_SRC_2, _impl->biases }, { TensorType::ACL_DST, _impl->dst } }; - - _impl->prep_pack = { { TensorType::ACL_SRC_1, _impl->weights } }; - _impl->workspace_tensors = manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->workspace_tensors = manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->run_pack); } Status CLWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo 
*weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, @@ -102,7 +99,11 @@ void CLWinogradConvolutionLayer::prepare() { if(!_impl->is_prepared) { - _impl->op->prepare(_impl->prep_pack); + _impl->op->prepare(_impl->run_pack); + + // Release Preparation tensors + release_prepare_tensors(_impl->workspace_tensors, _impl->run_pack); + _impl->run_pack.remove_tensor(TensorType::ACL_SRC_1); _impl->is_prepared = true; } } diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp index 168d93022f..4bf330fa1e 100644 --- a/src/runtime/NEON/functions/NEGEMM.cpp +++ b/src/runtime/NEON/functions/NEGEMM.cpp @@ -114,12 +114,12 @@ void NEGEMM::prepare() // Release temporary tensors that are only used in prepare stage for(auto &ws : _impl->workspace) { - const int slot = ws.first; + const int slot = ws.slot; for(auto &m : _impl->aux_mem_req) { if(m.slot == slot && m.lifetime == MemoryLifetime::Prepare) { - auto tensor = ws.second.get(); + auto tensor = ws.tensor.get(); tensor->allocator()->free(); break; } diff --git a/src/runtime/NEON/functions/NEGEMMConv2d.cpp b/src/runtime/NEON/functions/NEGEMMConv2d.cpp index 3ca5239ae3..7e2ce70444 100644 --- a/src/runtime/NEON/functions/NEGEMMConv2d.cpp +++ b/src/runtime/NEON/functions/NEGEMMConv2d.cpp @@ -104,12 +104,12 @@ void NEGEMMConv2d::prepare() // Release temporary tensors that are only used in prepare stage for(auto &ws : _impl->workspace) { - const int slot = ws.first; + const int slot = ws.slot; for(auto &m : _impl->aux_mem_req) { if(m.slot == slot && m.lifetime == MemoryLifetime::Prepare) { - auto tensor = ws.second.get(); + auto tensor = ws.tensor.get(); tensor->allocator()->free(); break; } diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp index 6386a678db..23ffbce954 100644 --- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp +++ 
b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp @@ -107,12 +107,12 @@ void NEGEMMConvolutionLayer::prepare() } for(auto &ws : _impl->workspace_tensors) { - const int slot = ws.first; + const int slot = ws.slot; for(auto &m : _impl->aux_mem_req) { if(m.slot == slot && m.lifetime == MemoryLifetime::Prepare) { - auto tensor = ws.second.get(); + auto tensor = ws.tensor.get(); tensor->allocator()->free(); break; } diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp index 641a2c2b5f..64507495ca 100644 --- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp +++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp @@ -110,12 +110,12 @@ void NEGEMMLowpMatrixMultiplyCore::prepare() // Release temporary tensors that are only used in prepare stage for(auto &ws : _impl->workspace_tensors) { - const int slot = ws.first; + const int slot = ws.slot; for(auto &m : _impl->aux_mem_req) { if(m.slot == slot && m.lifetime == MemoryLifetime::Prepare) { - auto tensor = ws.second.get(); + auto tensor = ws.tensor.get(); tensor->allocator()->free(); break; } diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp index 745179c050..b91048a426 100644 --- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp @@ -99,12 +99,12 @@ void NEWinogradConvolutionLayer::prepare() // Release temporary tensors that are only used in prepare stage for(auto &ws : _impl->workspace) { - const int slot = ws.first; + const int slot = ws.slot; for(auto &m : _impl->aux_mem_req) { if(m.slot == slot && m.lifetime == MemoryLifetime::Prepare) { - auto tensor = ws.second.get(); + auto tensor = ws.tensor.get(); tensor->allocator()->free(); break; } diff --git a/src/runtime/gpu/cl/operators/ClGemm.cpp b/src/runtime/gpu/cl/operators/ClGemm.cpp index a80375447d..cb0eecae4b 
100644 --- a/src/runtime/gpu/cl/operators/ClGemm.cpp +++ b/src/runtime/gpu/cl/operators/ClGemm.cpp @@ -37,6 +37,8 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/ITensorAllocator.h" + +#include "src/common/utils/Log.h" #include "src/core/gpu/cl/IClKernel.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/MemoryHelpers.h" @@ -744,6 +746,8 @@ void ClGemm::prepare(ITensorPack &constants) // If memory for RHS is persistent and src1 is provided re-transform else assume that RHS is transformed if((_aux_mem[AuxTensorIdx::RhsReshape].lifetime == MemoryLifetime::Persistent) && (src1 != nullptr && rhs_aux != nullptr) && rhs_aux) { + ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Transforming RHS Matrix!"); + CLAuxTensorHandler rhs_reshaped(_tmp_b, *rhs_aux); ARM_COMPUTE_ERROR_ON(rhs_reshaped.get()->cl_buffer().get() == nullptr); diff --git a/src/runtime/gpu/cl/operators/ClWinogradConv2d.cpp b/src/runtime/gpu/cl/operators/ClWinogradConv2d.cpp index c8db697778..2ca1ff59df 100644 --- a/src/runtime/gpu/cl/operators/ClWinogradConv2d.cpp +++ b/src/runtime/gpu/cl/operators/ClWinogradConv2d.cpp @@ -212,9 +212,15 @@ void ClWinogradConv2d::configure(const ClCompileContext &compile_context, ITenso // Configure output transform _output_transform->configure(compile_context, &_batched_mm_output, biases, dst, winograd_info, act_info); - _aux_mem = _batched_mm.workspace(); + _aux_mem = _batched_mm.workspace(); + const MemoryLifetime wino_wei_lifetm = std::any_of(std::begin(_aux_mem), std::end(_aux_mem), [](const auto & r) + { + return (r.lifetime == MemoryLifetime::Persistent) && (r.size > 0); + }) ? 
+ MemoryLifetime::Prepare : + MemoryLifetime::Persistent; _aux_mem.push_back(MemoryInfo(offset_int_vec(2), MemoryLifetime::Temporary, _input0.total_size())); - _aux_mem.push_back(MemoryInfo(offset_int_vec(3), MemoryLifetime::Persistent, _input1.total_size())); + _aux_mem.push_back(MemoryInfo(offset_int_vec(3), wino_wei_lifetm, _input1.total_size())); _aux_mem.push_back(MemoryInfo(offset_int_vec(4), MemoryLifetime::Temporary, _batched_mm_output.total_size())); } @@ -229,7 +235,6 @@ void ClWinogradConv2d::run(ITensorPack &tensors) { prepare(tensors); - // Run input transform auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); auto biases = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); @@ -238,6 +243,7 @@ void ClWinogradConv2d::run(ITensorPack &tensors) CLAuxTensorHandler input1(offset_int_vec(3), _input1, tensors, true); CLAuxTensorHandler batched_mm_output(offset_int_vec(4), _batched_mm_output, tensors, true); + // Run input transform ITensorPack pack_it { { TensorType::ACL_SRC, src }, @@ -247,12 +253,17 @@ void ClWinogradConv2d::run(ITensorPack &tensors) CLScheduler::get().enqueue_op(*_input_transform, pack_it); // Run batched matrix multiplication - ITensorPack pack_mm + ITensorPack pack_mm = tensors; + pack_mm.add_const_tensor(TensorType::ACL_SRC_0, input0.get()); + pack_mm.add_tensor(TensorType::ACL_DST, batched_mm_output.get()); + if(_aux_mem[3].lifetime == MemoryLifetime::Prepare) { - { TensorType::ACL_SRC_0, input0.get() }, - { TensorType::ACL_SRC_1, input1.get() }, - { TensorType::ACL_DST, batched_mm_output.get() }, - }; + pack_mm.remove_tensor(TensorType::ACL_SRC_1); + } + else + { + pack_mm.add_const_tensor(TensorType::ACL_SRC_1, input1.get()); + } _batched_mm.run(pack_mm); // Run output transform @@ -282,9 +293,10 @@ void 
ClWinogradConv2d::prepare(ITensorPack &tensors) CLScheduler::get().enqueue_op(*_filter_transform, pack_ft, false); weights->mark_as_unused(); - tensors.add_tensor(ACL_SRC_1, input1.get()); // Prepare GEMM and release reshaped weights if marked unused by ClGemm - _batched_mm.prepare(tensors); + ITensorPack mm_prepare_pack = tensors; + mm_prepare_pack.add_tensor(ACL_SRC_1, input1.get()); + _batched_mm.prepare(mm_prepare_pack); CLScheduler::get().queue().finish(); _is_prepared = true; |