Avoid multiple Rhs matrix transformation on ClGemm

ClWinogradConv2d was performing Rhs transformation on every step, impacting the performance.

Adds scope logging support through ARM_COMPUTE_LOG_MSG_WITH_FUNCNAME.

Resolves: COMPMID-4596

Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Change-Id: Ib329d3bc8d8aa21abae9fabfe61de35cc84d4819
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5925
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
author: Georgios Pinitas <georgios.pinitas@arm.com> 2021-07-08 18:14:45 +0100
committer: Georgios Pinitas <georgios.pinitas@arm.com> 2021-07-16 14:39:47 +0000
commit: 2b147ee857eb237613670460c52efedd43601955 (patch)
tree: 2c2f66754dca6d83e4967daae600e84bca8ca6b4
parent: d0c5df2695e6e30d600c0339f547373c0c6667b0 (diff)
download: ComputeLibrary-2b147ee857eb237613670460c52efedd43601955.tar.gz
12 files changed, 106 insertions, 31 deletions
diff --git a/arm_compute/core/utils/logging/Macros.h b/arm_compute/core/utils/logging/Macros.h
index 1108dd3800..4900bf9e6b 100644
--- a/arm_compute/core/utils/logging/Macros.h
+++ b/arm_compute/core/utils/logging/Macros.h
@@ -30,6 +30,20 @@
 
 #ifdef ARM_COMPUTE_LOGGING_ENABLED
 
+#ifdef __GNUC__
+inline std::string signature_name(const std::string &pretty_func)
+{
+    const auto scope_op = pretty_func.find("::");
+    const auto begin    = pretty_func.substr(0, scope_op).rfind(" ") + 1;
+    const auto end      = pretty_func.rfind("(") - begin;
+
+    return pretty_func.substr(begin, end) + "()";
+}
+#define ARM_COMPUTE_SIGNATURE_NAME signature_name(__PRETTY_FUNCTION__)
+#else /* __GNUC__ */
+#define ARM_COMPUTE_SIGNATURE_NAME (__func__)
+#endif /* __GNUC__ */
+
 #define ARM_COMPUTE_LOG_MSG(logger_name, log_level, msg)                                 \
     do                                                                                   \
     {                                                                                    \
@@ -47,7 +61,7 @@
         if(__logger != nullptr)                                                          \
         {                                                                                \
             std::ostringstream s;                                                        \
-            s << __func__ << ":" << msg;                                                 \
+            s << ARM_COMPUTE_SIGNATURE_NAME << " : " << msg;                             \
             __logger->log(log_level, s.str());                                           \
         }                                                                                \
     } while(false)
diff --git a/src/common/utils/Log.h b/src/common/utils/Log.h
index 496ee74a16..cfbc95a627 100644
--- a/src/common/utils/Log.h
+++ b/src/common/utils/Log.h
@@ -43,6 +43,7 @@
 #else /* ARM_COMPUTE_LOGGING_ENABLED */
 #define ARM_COMPUTE_CREATE_ACL_LOGGER()
 #endif /* ARM_COMPUTE_LOGGING_ENABLED */
+
 /** Log a message to the logger
  *
  * @param[in] log_level Logging level
@@ -54,6 +55,7 @@
         ARM_COMPUTE_CREATE_ACL_LOGGER();                       \
         ARM_COMPUTE_LOG_MSG("ComputeLibrary", log_level, msg); \
     } while(false)
+
 /** Log a message with format to the logger
  *
  * @param[in] log_level Logging level
@@ -66,6 +68,7 @@
         ARM_COMPUTE_CREATE_ACL_LOGGER();                                                \
         ARM_COMPUTE_LOG_MSG_WITH_FORMAT("ComputeLibrary", log_level, fmt, __VA_ARGS__); \
     } while(false)
+
 /** Log an error message to the logger
  *
  * @param[in] msg Message to log
@@ -88,4 +91,15 @@
         ARM_COMPUTE_LOG_MSG_WITH_FUNCNAME("ComputeLibrary", arm_compute::logging::LogLevel::ERROR, msg); \
     } while(false)
 
+/** Log an information message to the logger with function name before the message
+ *
+ * @param[in] msg Message to log
+ */
+#define ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL(msg)                                                     \
+    do                                                                                                  \
+    {                                                                                                   \
+        ARM_COMPUTE_CREATE_ACL_LOGGER();                                                                \
+        ARM_COMPUTE_LOG_MSG_WITH_FUNCNAME("ComputeLibrary", arm_compute::logging::LogLevel::INFO, msg); \
+    } while(false)
+
 #endif /* SRC_COMMON_LOG_H */
diff --git a/src/core/helpers/MemoryHelpers.h b/src/core/helpers/MemoryHelpers.h
index 619a4ec122..60a2dbfff7 100644
--- a/src/core/helpers/MemoryHelpers.h
+++ b/src/core/helpers/MemoryHelpers.h
@@ -41,7 +41,15 @@ inline int offset_int_vec(int offset)
 }
 
 template <typename TensorType>
-using WorkspaceData = std::vector<std::pair<int, std::unique_ptr<TensorType>>>;
+struct WorkspaceDataElement
+{
+    int                          slot{ -1 };
+    experimental::MemoryLifetime lifetime{ experimental::MemoryLifetime::Temporary };
+    std::unique_ptr<TensorType>  tensor{ nullptr };
+};
+
+template <typename TensorType>
+using WorkspaceData = std::vector<WorkspaceDataElement<TensorType>>;
 
 template <typename TensorType>
 WorkspaceData<TensorType> manage_workspace(const experimental::MemoryRequirements &mem_reqs,
@@ -66,9 +74,9 @@ WorkspaceData<TensorType> manage_workspace(const experimental::MemoryRequirement
         }
 
         const auto aux_info = TensorInfo{ TensorShape(req.size), 1, DataType::U8 };
-        workspace_memory.emplace_back(req.slot, std::make_unique<TensorType>());
+        workspace_memory.emplace_back(WorkspaceDataElement<TensorType> { req.slot, req.lifetime, std::make_unique<TensorType>() });
 
-        auto aux_tensor = workspace_memory.back().second.get();
+        auto aux_tensor = workspace_memory.back().tensor.get();
         ARM_COMPUTE_ERROR_ON_NULLPTR(aux_tensor);
         aux_tensor->allocator()->init(aux_info, req.alignment);
 
@@ -85,11 +93,28 @@ WorkspaceData<TensorType> manage_workspace(const experimental::MemoryRequirement
 
     for(auto &mem : workspace_memory)
     {
-        auto tensor = mem.second.get();
+        auto tensor = mem.tensor.get();
         tensor->allocator()->allocate();
     }
 
     return workspace_memory;
 }
+
+template <typename TensorType>
+void release_prepare_tensors(WorkspaceData<TensorType> &workspace, ITensorPack &prep_pack)
+{
+    workspace.erase(std::remove_if(workspace.begin(),
+                                   workspace.end(),
+                                   [&prep_pack](auto & wk)
+    {
+        const bool to_erase = wk.lifetime == experimental::MemoryLifetime::Prepare;
+        if(to_erase)
+        {
+            prep_pack.remove_tensor(wk.slot);
+        }
+        return to_erase;
+    }),
+    workspace.end());
+}
 } // namespace arm_compute
 #endif /* SRC_COMMON_MEMORY_HELPERS_H */
diff --git a/src/graph/GraphManager.cpp b/src/graph/GraphManager.cpp
index e357f10401..45b608c70a 100644
--- a/src/graph/GraphManager.cpp
+++ b/src/graph/GraphManager.cpp
@@ -29,10 +29,11 @@
 #include "arm_compute/graph/PassManager.h"
 #include "arm_compute/graph/TypePrinter.h"
 #include "arm_compute/graph/Utils.h"
+#include "arm_compute/graph/algorithms/TopologicalSort.h"
 #include "arm_compute/graph/detail/CrossLayerMemoryManagerHelpers.h"
 #include "arm_compute/graph/detail/ExecutionHelpers.h"
 
-#include "arm_compute/graph/algorithms/TopologicalSort.h"
+#include "src/common/utils/Log.h"
 
 namespace arm_compute
 {
@@ -45,6 +46,8 @@ GraphManager::GraphManager()
 
 void GraphManager::finalize_graph(Graph &graph, GraphContext &ctx, PassManager &pm, Target target)
 {
+    ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Initiate graph configuration!");
+
     // Check if graph has been registered
     if(_workloads.find(graph.id()) != std::end(_workloads))
     {
@@ -121,6 +124,8 @@ void GraphManager::finalize_graph(Graph &graph, GraphContext &ctx, PassManager &
 
 void GraphManager::execute_graph(Graph &graph)
 {
+    ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Initiate graph execution!");
+
     // Check if graph is finalized
     auto it = _workloads.find(graph.id());
     ARM_COMPUTE_ERROR_ON_MSG(it == std::end(_workloads), "Graph is not registered!");
diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
index f758c3d0b3..fa01c914c5 100644
--- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
@@ -41,7 +41,6 @@ struct CLWinogradConvolutionLayer::Impl
     ICLTensor                                *dst{ nullptr };
     std::unique_ptr<opencl::ClWinogradConv2d> op{ nullptr };
     ITensorPack                               run_pack{};
-    ITensorPack                               prep_pack{};
     MemoryGroup                               memory_group{};
     WorkspaceData<CLTensor>                   workspace_tensors{};
     bool                                      is_prepared{ false };
@@ -80,9 +79,7 @@ void CLWinogradConvolutionLayer::configure(const CLCompileContext &compile_conte
         { TensorType::ACL_SRC_2, _impl->biases },
         { TensorType::ACL_DST, _impl->dst }
     };
-
-    _impl->prep_pack         = { { TensorType::ACL_SRC_1, _impl->weights } };
-    _impl->workspace_tensors = manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack);
+    _impl->workspace_tensors = manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->run_pack);
 }
 
 Status CLWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
@@ -102,7 +99,11 @@ void CLWinogradConvolutionLayer::prepare()
 {
     if(!_impl->is_prepared)
     {
-        _impl->op->prepare(_impl->prep_pack);
+        _impl->op->prepare(_impl->run_pack);
+
+        // Release Preparation tensors
+        release_prepare_tensors(_impl->workspace_tensors, _impl->run_pack);
+        _impl->run_pack.remove_tensor(TensorType::ACL_SRC_1);
         _impl->is_prepared = true;
     }
 }
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index 168d93022f..4bf330fa1e 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -114,12 +114,12 @@ void NEGEMM::prepare()
         // Release temporary tensors that are only used in prepare stage
         for(auto &ws : _impl->workspace)
         {
-            const int slot = ws.first;
+            const int slot = ws.slot;
             for(auto &m : _impl->aux_mem_req)
             {
                 if(m.slot == slot && m.lifetime == MemoryLifetime::Prepare)
                 {
-                    auto tensor = ws.second.get();
+                    auto tensor = ws.tensor.get();
                     tensor->allocator()->free();
                     break;
                 }
diff --git a/src/runtime/NEON/functions/NEGEMMConv2d.cpp b/src/runtime/NEON/functions/NEGEMMConv2d.cpp
index 3ca5239ae3..7e2ce70444 100644
--- a/src/runtime/NEON/functions/NEGEMMConv2d.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConv2d.cpp
@@ -104,12 +104,12 @@ void NEGEMMConv2d::prepare()
         // Release temporary tensors that are only used in prepare stage
         for(auto &ws : _impl->workspace)
         {
-            const int slot = ws.first;
+            const int slot = ws.slot;
             for(auto &m : _impl->aux_mem_req)
             {
                 if(m.slot == slot && m.lifetime == MemoryLifetime::Prepare)
                 {
-                    auto tensor = ws.second.get();
+                    auto tensor = ws.tensor.get();
                     tensor->allocator()->free();
                     break;
                 }
diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
index 6386a678db..23ffbce954 100644
--- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
@@ -107,12 +107,12 @@ void NEGEMMConvolutionLayer::prepare()
         }
         for(auto &ws : _impl->workspace_tensors)
         {
-            const int slot = ws.first;
+            const int slot = ws.slot;
             for(auto &m : _impl->aux_mem_req)
             {
                 if(m.slot == slot && m.lifetime == MemoryLifetime::Prepare)
                 {
-                    auto tensor = ws.second.get();
+                    auto tensor = ws.tensor.get();
                     tensor->allocator()->free();
                     break;
                 }
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index 641a2c2b5f..64507495ca 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -110,12 +110,12 @@ void NEGEMMLowpMatrixMultiplyCore::prepare()
         // Release temporary tensors that are only used in prepare stage
         for(auto &ws : _impl->workspace_tensors)
         {
-            const int slot = ws.first;
+            const int slot = ws.slot;
             for(auto &m : _impl->aux_mem_req)
             {
                 if(m.slot == slot && m.lifetime == MemoryLifetime::Prepare)
                 {
-                    auto tensor = ws.second.get();
+                    auto tensor = ws.tensor.get();
                     tensor->allocator()->free();
                     break;
                 }
diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
index 745179c050..b91048a426 100644
--- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
@@ -99,12 +99,12 @@ void NEWinogradConvolutionLayer::prepare()
         // Release temporary tensors that are only used in prepare stage
         for(auto &ws : _impl->workspace)
         {
-            const int slot = ws.first;
+            const int slot = ws.slot;
             for(auto &m : _impl->aux_mem_req)
             {
                 if(m.slot == slot && m.lifetime == MemoryLifetime::Prepare)
                 {
-                    auto tensor = ws.second.get();
+                    auto tensor = ws.tensor.get();
                     tensor->allocator()->free();
                     break;
                 }
diff --git a/src/runtime/gpu/cl/operators/ClGemm.cpp b/src/runtime/gpu/cl/operators/ClGemm.cpp
index a80375447d..cb0eecae4b 100644
--- a/src/runtime/gpu/cl/operators/ClGemm.cpp
+++ b/src/runtime/gpu/cl/operators/ClGemm.cpp
@@ -37,6 +37,8 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "arm_compute/runtime/ITensorAllocator.h"
+
+#include "src/common/utils/Log.h"
 #include "src/core/gpu/cl/IClKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/MemoryHelpers.h"
@@ -744,6 +746,8 @@ void ClGemm::prepare(ITensorPack &constants)
     // If memory for RHS is persistent and src1 is provided re-transform else assume that RHS is transformed
     if((_aux_mem[AuxTensorIdx::RhsReshape].lifetime == MemoryLifetime::Persistent) && (src1 != nullptr && rhs_aux != nullptr) && rhs_aux)
     {
+        ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Transforming RHS Matrix!");
+
         CLAuxTensorHandler rhs_reshaped(_tmp_b, *rhs_aux);
         ARM_COMPUTE_ERROR_ON(rhs_reshaped.get()->cl_buffer().get() == nullptr);
 
diff --git a/src/runtime/gpu/cl/operators/ClWinogradConv2d.cpp b/src/runtime/gpu/cl/operators/ClWinogradConv2d.cpp
index c8db697778..2ca1ff59df 100644
--- a/src/runtime/gpu/cl/operators/ClWinogradConv2d.cpp
+++ b/src/runtime/gpu/cl/operators/ClWinogradConv2d.cpp
@@ -212,9 +212,15 @@ void ClWinogradConv2d::configure(const ClCompileContext &compile_context, ITenso
     // Configure output transform
     _output_transform->configure(compile_context, &_batched_mm_output, biases, dst, winograd_info, act_info);
 
-    _aux_mem = _batched_mm.workspace();
+    _aux_mem                             = _batched_mm.workspace();
+    const MemoryLifetime wino_wei_lifetm = std::any_of(std::begin(_aux_mem), std::end(_aux_mem), [](const auto & r)
+    {
+        return (r.lifetime == MemoryLifetime::Persistent) && (r.size > 0);
+    }) ?
+    MemoryLifetime::Prepare :
+    MemoryLifetime::Persistent;
     _aux_mem.push_back(MemoryInfo(offset_int_vec(2), MemoryLifetime::Temporary, _input0.total_size()));
-    _aux_mem.push_back(MemoryInfo(offset_int_vec(3), MemoryLifetime::Persistent, _input1.total_size()));
+    _aux_mem.push_back(MemoryInfo(offset_int_vec(3), wino_wei_lifetm, _input1.total_size()));
     _aux_mem.push_back(MemoryInfo(offset_int_vec(4), MemoryLifetime::Temporary, _batched_mm_output.total_size()));
 }
 
@@ -229,7 +235,6 @@ void ClWinogradConv2d::run(ITensorPack &tensors)
 {
     prepare(tensors);
 
-    // Run input transform
     auto src    = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
     auto biases = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
     auto dst    = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
@@ -238,6 +243,7 @@ void ClWinogradConv2d::run(ITensorPack &tensors)
     CLAuxTensorHandler input1(offset_int_vec(3), _input1, tensors, true);
     CLAuxTensorHandler batched_mm_output(offset_int_vec(4), _batched_mm_output, tensors, true);
 
+    // Run input transform
     ITensorPack pack_it
     {
         { TensorType::ACL_SRC, src },
@@ -247,12 +253,17 @@ void ClWinogradConv2d::run(ITensorPack &tensors)
     CLScheduler::get().enqueue_op(*_input_transform, pack_it);
 
     // Run batched matrix multiplication
-    ITensorPack pack_mm
+    ITensorPack pack_mm = tensors;
+    pack_mm.add_const_tensor(TensorType::ACL_SRC_0, input0.get());
+    pack_mm.add_tensor(TensorType::ACL_DST, batched_mm_output.get());
+    if(_aux_mem[3].lifetime == MemoryLifetime::Prepare)
     {
-        { TensorType::ACL_SRC_0, input0.get() },
-        { TensorType::ACL_SRC_1, input1.get() },
-        { TensorType::ACL_DST, batched_mm_output.get() },
-    };
+        pack_mm.remove_tensor(TensorType::ACL_SRC_1);
+    }
+    else
+    {
+        pack_mm.add_const_tensor(TensorType::ACL_SRC_1, input1.get());
+    }
     _batched_mm.run(pack_mm);
 
     // Run output transform
@@ -282,9 +293,10 @@ void ClWinogradConv2d::prepare(ITensorPack &tensors)
         CLScheduler::get().enqueue_op(*_filter_transform, pack_ft, false);
         weights->mark_as_unused();
 
-        tensors.add_tensor(ACL_SRC_1, input1.get());
         // Prepare GEMM and release reshaped weights if marked unused by ClGemm
-        _batched_mm.prepare(tensors);
+        ITensorPack mm_prepare_pack = tensors;
+        mm_prepare_pack.add_tensor(ACL_SRC_1, input1.get());
+        _batched_mm.prepare(mm_prepare_pack);
 
         CLScheduler::get().queue().finish();
         _is_prepared = true;
author	Georgios Pinitas <georgios.pinitas@arm.com>	2021-07-08 18:14:45 +0100
committer	Georgios Pinitas <georgios.pinitas@arm.com>	2021-07-16 14:39:47 +0000
commit	2b147ee857eb237613670460c52efedd43601955 (patch)
tree	2c2f66754dca6d83e4967daae600e84bca8ca6b4
parent	d0c5df2695e6e30d600c0339f547373c0c6667b0 (diff)
download	ComputeLibrary-2b147ee857eb237613670460c52efedd43601955.tar.gz