From a08d29b815987e98e7f45519e6a55eee0f085e5f Mon Sep 17 00:00:00 2001
From: Derek Lamberti <derek.lamberti@arm.com>
Date: Fri, 19 Jun 2020 14:33:05 +0100
Subject: Minor improvement of inference profiling

* Start inference profiling at the actual beginning
* Add profiling events for EnqueueInputs and EnqueueOutputs
* Add profiling event for working memory allocation
* Refactor Execute body to remove code duplication
* Forward arguments to constructors rather than copy

Change-Id: Iacab85f0a02e88e2423885f86f97e4dba4037319
Signed-off-by: Derek Lamberti <derek.lamberti@arm.com>
---
 src/armnn/LoadedNetwork.cpp | 89 ++++++++++++++++++++-------------------------
 src/armnn/LoadedNetwork.hpp |  2 +-
 src/armnn/Profiling.hpp     | 10 ++---
 src/armnn/Runtime.cpp       |  1 +
 4 files changed, 46 insertions(+), 56 deletions(-)

diff --git a/src/armnn/LoadedNetwork.cpp b/src/armnn/LoadedNetwork.cpp
index fbf8cfbb4c..b35dfd1107 100644
--- a/src/armnn/LoadedNetwork.cpp
+++ b/src/armnn/LoadedNetwork.cpp
@@ -451,8 +451,6 @@ private:
 Status LoadedNetwork::EnqueueWorkload(const InputTensors& inputTensors,
                                       const OutputTensors& outputTensors)
 {
-    ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "EnqueueWorkload");
-
     const Graph& graph = m_OptimizedNetwork->GetGraph();
 
     // Walk graph to determine the order of execution.
@@ -471,21 +469,27 @@ Status LoadedNetwork::EnqueueWorkload(const InputTensors& inputTensors,
     }
 
     // For each input to the network, call EnqueueInput with the data passed by the user.
-    m_InputQueue.clear();
-    m_InputQueue.reserve(graph.GetNumInputs());
-    for (const BindableLayer* inputLayer : graph.GetInputLayers())
     {
-        const TensorPin& pin = workloadData.GetInputTensorPin(inputLayer->GetBindingId());
-        EnqueueInput(*inputLayer, pin.GetTensorHandle(), pin.GetTensorInfo());
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareInputs");
+        m_InputQueue.clear();
+        m_InputQueue.reserve(graph.GetNumInputs());
+        for (const BindableLayer* inputLayer : graph.GetInputLayers())
+        {
+            const TensorPin& pin = workloadData.GetInputTensorPin(inputLayer->GetBindingId());
+            EnqueueInput(*inputLayer, pin.GetTensorHandle(), pin.GetTensorInfo());
+        }
     }
 
     // For each output to the network, call EnqueueOutput with the data passed by the user.
-    m_OutputQueue.clear();
-    m_OutputQueue.reserve(graph.GetNumOutputs());
-    for (const BindableLayer* outputLayer : graph.GetOutputLayers())
     {
-        const TensorPin& pin = workloadData.GetOutputTensorPin(outputLayer->GetBindingId());
-        EnqueueOutput(*outputLayer, pin.GetTensorHandle(), pin.GetTensorInfo());
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareOutputs");
+        m_OutputQueue.clear();
+        m_OutputQueue.reserve(graph.GetNumOutputs());
+        for (const BindableLayer* outputLayer : graph.GetOutputLayers())
+        {
+            const TensorPin& pin = workloadData.GetOutputTensorPin(outputLayer->GetBindingId());
+            EnqueueOutput(*outputLayer, pin.GetTensorHandle(), pin.GetTensorInfo());
+        }
     }
 
     std::unique_ptr<TimelineUtilityMethods> timelineUtils =
@@ -684,8 +688,13 @@ void LoadedNetwork::EnqueueOutput(const BindableLayer& layer, ITensorHandle* ten
     }
 }
 
-void LoadedNetwork::AllocateWorkingMemory()
+void LoadedNetwork::AllocateWorkingMemory(std::lock_guard<std::mutex>& lock)
 {
+    ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "Working Memory Allocation");
+
+    // this unused parameter makes sure we can only call this function with a valid lock
+    IgnoreUnused(lock);
+
     if (m_IsWorkingMemAllocated)
     {
         return;
@@ -736,49 +745,29 @@ bool LoadedNetwork::Execute(std::unique_ptr<TimelineUtilityMethods>& timelineUti
     try
     {
         std::lock_guard<std::mutex> lockGuard(m_WorkingMemMutex);
-        AllocateWorkingMemory();
+        AllocateWorkingMemory(lockGuard);
 
         ProfilingDynamicGuid workloadInferenceID(0);
-        for (auto& input : m_InputQueue)
+        auto ExecuteQueue = [&timelineUtils, &workloadInferenceID, &inferenceGuid](WorkloadQueue& queue)
         {
-            if(timelineUtils)
+            for (auto& workload : queue)
             {
-                workloadInferenceID = timelineUtils->RecordWorkloadInferenceAndStartOfLifeEvent(input->GetGuid(),
-                                                                                                inferenceGuid);
-            }
-            input->Execute();
-            if(timelineUtils)
-            {
-                timelineUtils->RecordEndOfLifeEvent(workloadInferenceID);
+                if(timelineUtils)
+                {
+                    workloadInferenceID = timelineUtils->RecordWorkloadInferenceAndStartOfLifeEvent(workload->GetGuid(),
+                                                                                                    inferenceGuid);
+                }
+                workload->Execute();
+                if(timelineUtils)
+                {
+                    timelineUtils->RecordEndOfLifeEvent(workloadInferenceID);
+                }
             }
-        }
+        };
 
-        for (auto& workload : m_WorkloadQueue)
-        {
-            if(timelineUtils)
-            {
-                workloadInferenceID = timelineUtils->RecordWorkloadInferenceAndStartOfLifeEvent(workload->GetGuid(),
-                                                                                                inferenceGuid);
-            }
-            workload->Execute();
-            if(timelineUtils)
-            {
-                timelineUtils->RecordEndOfLifeEvent(workloadInferenceID);
-            }
-        }
-        for (auto& output: m_OutputQueue)
-        {
-            if(timelineUtils)
-            {
-                workloadInferenceID = timelineUtils->RecordWorkloadInferenceAndStartOfLifeEvent(output->GetGuid(),
-                                                                                                inferenceGuid);
-            }
-            output->Execute();
-            if(timelineUtils)
-            {
-                timelineUtils->RecordEndOfLifeEvent(workloadInferenceID);
-            }
-        }
+        ExecuteQueue(m_InputQueue);
+        ExecuteQueue(m_WorkloadQueue);
+        ExecuteQueue(m_OutputQueue);
     }
     catch (const RuntimeException& error)
     {
diff --git a/src/armnn/LoadedNetwork.hpp b/src/armnn/LoadedNetwork.hpp
index 918375ac38..8c2103019e 100644
--- a/src/armnn/LoadedNetwork.hpp
+++ b/src/armnn/LoadedNetwork.hpp
@@ -59,7 +59,7 @@ public:
     void SendNetworkStructure();
 
 private:
-    void AllocateWorkingMemory();
+    void AllocateWorkingMemory(std::lock_guard<std::mutex>& lock);
 
     LoadedNetwork(std::unique_ptr<OptimizedNetwork> net,
                   const INetworkProperties& networkProperties,
diff --git a/src/armnn/Profiling.hpp b/src/armnn/Profiling.hpp
index 08d7f7ba21..08e55a14c5 100644
--- a/src/armnn/Profiling.hpp
+++ b/src/armnn/Profiling.hpp
@@ -115,7 +115,7 @@ public:
     using InstrumentPtr = std::unique_ptr<Instrument>;
 
     template<typename... Args>
-    ScopedProfilingEvent(const BackendId& backendId, const std::string& name, Args... args)
+    ScopedProfilingEvent(const BackendId& backendId, const std::string& name, Args&&... args)
         : m_Event(nullptr)
         , m_Profiler(ProfilerManager::GetInstance().GetProfiler())
     {
@@ -123,7 +123,7 @@ public:
         {
             std::vector<InstrumentPtr> instruments(0);
             instruments.reserve(sizeof...(args)); //One allocation
-            ConstructNextInVector(instruments, args...);
+            ConstructNextInVector(instruments, std::forward<Args>(args)...);
             m_Event = m_Profiler->BeginEvent(backendId, name, std::move(instruments));
         }
     }
@@ -144,10 +144,10 @@ private:
     }
 
     template<typename Arg, typename... Args>
-    void ConstructNextInVector(std::vector<InstrumentPtr>& instruments, Arg arg, Args... args)
+    void ConstructNextInVector(std::vector<InstrumentPtr>& instruments, Arg&& arg, Args&&... args)
     {
-        instruments.emplace_back(std::make_unique<Arg>(arg));
-        ConstructNextInVector(instruments, args...);
+        instruments.emplace_back(std::make_unique<Arg>(std::forward<Arg>(arg)));
+        ConstructNextInVector(instruments, std::forward<Args>(args)...);
     }
 
     Event* m_Event;    ///< Event to track
diff --git a/src/armnn/Runtime.cpp b/src/armnn/Runtime.cpp
index 5692494836..28e2df22ab 100644
--- a/src/armnn/Runtime.cpp
+++ b/src/armnn/Runtime.cpp
@@ -308,6 +308,7 @@ Status Runtime::EnqueueWorkload(NetworkId networkId,
                                 const InputTensors& inputTensors,
                                 const OutputTensors& outputTensors)
 {
+    ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "EnqueueWorkload");
     LoadedNetwork* loadedNetwork = GetLoadedNetworkPtr(networkId);
 
     static thread_local NetworkId lastId = networkId;
-- 
cgit v1.2.1
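
The AllocateWorkingMemory change above relies on a lock-token idiom: the function takes a std::lock_guard<std::mutex>& that it never reads, so it simply cannot be called unless the caller already holds a guard, and IgnoreUnused(lock) silences the unused-parameter warning. A minimal standalone sketch of the idiom, assuming nothing from ArmNN (Counter, Increment and IncrementWithLock are made-up names):

    #include <mutex>

    class Counter
    {
    public:
        void Increment()
        {
            std::lock_guard<std::mutex> lockGuard(m_Mutex);
            IncrementWithLock(lockGuard); // compiles: a guard exists in this scope
        }

    private:
        // The reference parameter is never used at runtime; it exists so the
        // compiler rejects any call site that has not constructed a guard.
        void IncrementWithLock(std::lock_guard<std::mutex>& lock)
        {
            static_cast<void>(lock); // dependency-free stand-in for IgnoreUnused
            ++m_Count;
        }

        std::mutex m_Mutex;
        int m_Count = 0;
    };

The check is compile-time only and not airtight (a caller could pass a guard on a different mutex), but it documents and largely enforces the locking contract at zero runtime cost.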
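
The Profiling.hpp hunks switch the instrument packs from by-value parameters (Args... args) to forwarding references (Args&&... args) passed on with std::forward. A sketch of what that buys, using a made-up Timer type that reports its copies and moves (AddByValue and AddByForward are illustrative, not ArmNN functions):

    #include <iostream>
    #include <memory>
    #include <utility>
    #include <vector>

    struct Timer
    {
        Timer() = default;
        Timer(const Timer&) { std::cout << "copy\n"; }
        Timer(Timer&&) noexcept { std::cout << "move\n"; }
    };

    // Old shape: the instrument is copied again into the heap allocation.
    void AddByValue(std::vector<std::unique_ptr<Timer>>& v, Timer arg)
    {
        v.emplace_back(std::make_unique<Timer>(arg));
    }

    // New shape: an rvalue argument is forwarded and moved instead.
    template<typename Arg>
    void AddByForward(std::vector<std::unique_ptr<Timer>>& v, Arg&& arg)
    {
        v.emplace_back(std::make_unique<Timer>(std::forward<Arg>(arg)));
    }

    int main()
    {
        std::vector<std::unique_ptr<Timer>> v;
        AddByValue(v, Timer{});   // prints "copy"
        AddByForward(v, Timer{}); // prints "move"
    }

Because ConstructNextInVector peels the pack recursively, the old code repeated that copy at every level of the recursion; with forwarding, each temporary instrument is moved exactly once.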
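
ARMNN_SCOPED_PROFILING_EVENT is RAII-scoped: it starts timing where it is declared and stops at the closing brace of the enclosing scope. That is why the "EnqueueWorkload" event moves up into Runtime::EnqueueWorkload, so measurement starts at the actual beginning of the inference, and why PrepareInputs and PrepareOutputs get their own brace-delimited blocks. A dependency-free sketch of the mechanism, with ScopedTimer and std::chrono standing in for ArmNN's profiler machinery:

    #include <chrono>
    #include <iostream>
    #include <string>

    class ScopedTimer
    {
    public:
        explicit ScopedTimer(std::string name)
            : m_Name(std::move(name))
            , m_Start(std::chrono::steady_clock::now())
        {}

        ~ScopedTimer() // fires at the closing brace of the enclosing scope
        {
            auto us = std::chrono::duration_cast<std::chrono::microseconds>(
                          std::chrono::steady_clock::now() - m_Start).count();
            std::cout << m_Name << ": " << us << " us\n";
        }

    private:
        std::string m_Name;
        std::chrono::steady_clock::time_point m_Start;
    };

    int main()
    {
        ScopedTimer whole("EnqueueWorkload"); // covers the full function body
        {
            ScopedTimer inputs("PrepareInputs"); // extra braces bound the measured region
            // ... enqueue inputs ...
        } // "PrepareInputs" reported here
        {
            ScopedTimer outputs("PrepareOutputs");
            // ... enqueue outputs ...
        } // "PrepareOutputs" reported here
    }     // "EnqueueWorkload" reported last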