author    telsoa01 <telmo.soares@arm.com>  2018-08-31 09:22:23 +0100
committer telsoa01 <telmo.soares@arm.com>  2018-08-31 09:22:23 +0100
commit    c577f2c6a3b4ddb6ba87a882723c53a248afbeba (patch)
tree      bd7d4c148df27f8be6649d313efb24f536b7cf34 /src/armnn/Profiling.cpp
parent    4c7098bfeab1ffe1cdc77f6c15548d3e73274746 (diff)
download  armnn-c577f2c6a3b4ddb6ba87a882723c53a248afbeba.tar.gz
Release 18.08
Diffstat (limited to 'src/armnn/Profiling.cpp')
-rw-r--r--    src/armnn/Profiling.cpp    455
1 file changed, 346 insertions(+), 109 deletions(-)
diff --git a/src/armnn/Profiling.cpp b/src/armnn/Profiling.cpp
index 15a195e6bd..f70f6a34d1 100644
--- a/src/armnn/Profiling.cpp
+++ b/src/armnn/Profiling.cpp
@@ -3,8 +3,7 @@
// See LICENSE file in the project root for full license information.
//
#include "Profiling.hpp"
-
-#if ARMNN_PROFILING_ENABLED
+#include "JsonPrinter.hpp"
#if ARMNN_STREAMLINE_ENABLED
#include <streamline_annotate.h>
@@ -17,10 +16,12 @@
#include <algorithm>
#include <iomanip>
#include <iostream>
+#include <fstream>
#include <map>
#include <stack>
-#include <boost/algorithm/string.hpp>
+#include <boost/algorithm/string.hpp>
+#include <boost/core/ignore_unused.hpp>
namespace armnn
{
@@ -32,86 +33,128 @@ constexpr std::size_t g_ProfilingEventCountHint = 1024;
// Whether profiling reports should include the sequence of events together with their timings.
constexpr bool g_WriteProfilingEventSequence = true;
-// Whether profiling reports should also report detailed information on events grouped by tag.
-// This is used to group stats per inference (see usage of ARMNN_UPDATE_PROFILING_EVENT_TAG in
-// Runtime::EnqueueWorkload). This can spam the output stream, so use carefully (or adapt
-// the code to just output information for a tag of interest).
-constexpr bool g_AggregateProfilingEventsByTag = false;
+// Whether profiling reports should also report detailed information on events grouped by inference.
+// This can spam the output stream, so use carefully (or adapt the code to just output information
+// of interest).
+constexpr bool g_AggregateProfilingEventsByInference = true;
-// Whether a call to Profiler::AnalyzeEventsAndWriteResults() will be made when the Profiler
-// singleton is destroyed. It can be convenient for local tests.
-constexpr bool g_WriteReportToStdOutOnProfilerDestruction = true;
+// Whether a call to Profiler::AnalyzeEventsAndWriteResults() will be made when the Profiler is destroyed.
+// It can be convenient for local tests.
+constexpr bool g_WriteReportToStdOutOnProfilerDestruction = false;
// Whether events denoting operations running on the GPU should force a sync before/after the event.
// This is hardcoded to true for now as the profiling timings are not very useful without it.
+#if ARMCOMPUTECL_ENABLED
constexpr bool g_ProfilingForceGpuSync = true;
+#endif
+
+Measurement FindMeasurement(const std::string& name, const Event* event)
+{
+
+ BOOST_ASSERT(event != nullptr);
+
+ // Search though the measurements.
+ for (const auto& measurement : event->GetMeasurements())
+ {
+ if (measurement.m_Name == name)
+ {
+ // Measurement found.
+ return measurement;
+ }
+ }
+
+ // Measurement not found.
+ return Measurement{ "", 0.f, Measurement::Unit::TIME_MS };
+}
+
+std::vector<Measurement> FindKernelMeasurements(const Event* event)
+{
+ BOOST_ASSERT(event != nullptr);
+
+ std::vector<Measurement> measurements;
+
+ // Search through the measurements.
+ for (const auto& measurement : event->GetMeasurements())
+ {
+ if (measurement.m_Name.rfind("OpenClKernelTimer", 0) == 0
+ || measurement.m_Name.rfind("NeonKernelTimer", 0) == 0)
+ {
+ // Measurement found.
+ measurements.push_back(measurement);
+ }
+ }
+
+ return measurements;
+}
std::map<std::string, Profiler::ProfilingEventStats> Profiler::CalculateProfilingEventStats() const
{
std::map<std::string, ProfilingEventStats> nameToStatsMap;
- for (auto&& event : m_EventSequence)
+ for (const auto& event : m_EventSequence)
{
- auto mapIter = nameToStatsMap.find(event.m_Label);
- if (mapIter != nameToStatsMap.end())
+ Measurement measurement = FindMeasurement(WallClockTimer::WALL_CLOCK_TIME, event.get());
+
+ double durationMs = measurement.m_Value;
+ auto it = nameToStatsMap.find(event->GetName());
+ if (it != nameToStatsMap.end())
{
- ProfilingEventStats& stats = mapIter->second;
- stats.m_TotalMs += event.DurationMs();
- stats.m_MinMs = std::min(stats.m_MinMs, event.DurationMs());
- stats.m_MaxMs = std::max(stats.m_MaxMs, event.DurationMs());
+ ProfilingEventStats& stats = it->second;
+ stats.m_TotalMs += durationMs;
+ stats.m_MinMs = std::min(stats.m_MinMs, durationMs);
+ stats.m_MaxMs = std::max(stats.m_MaxMs, durationMs);
++stats.m_Count;
}
else
{
- ProfilingEventStats stats;
- stats.m_TotalMs = event.DurationMs();
- stats.m_MinMs = event.DurationMs();
- stats.m_MaxMs = event.DurationMs();
- stats.m_Count = 1;
-
- nameToStatsMap[event.m_Label] = stats;
+ nameToStatsMap.emplace(event->GetName(), ProfilingEventStats{ durationMs, durationMs, durationMs, 1 });
}
}
return nameToStatsMap;
}
-void Profiler::AnalyzeEventSequenceAndWriteResults(std::vector<ProfilingEvent>::const_iterator first,
- std::vector<ProfilingEvent>::const_iterator last,
- std::ostream& outStream) const
+const Event* GetEventPtr(const Event* ptr) { return ptr;}
+const Event* GetEventPtr(const std::unique_ptr<Event>& ptr) {return ptr.get(); }
+
+template<typename ItertType>
+void Profiler::AnalyzeEventSequenceAndWriteResults(ItertType first, ItertType last, std::ostream& outStream) const
{
- // Output event sequence, if needed
+ // Outputs event sequence, if needed.
if (g_WriteProfilingEventSequence)
{
- // Make sure timestamps are output with 6 decimals, and save old settings
+ // Makes sure timestamps are output with 6 decimals, and save old settings.
std::streamsize oldPrecision = outStream.precision();
outStream.precision(6);
std::ios_base::fmtflags oldFlags = outStream.flags();
outStream.setf(std::ios::fixed);
- // Output fields
+ // Outputs fields.
outStream << "Event Sequence - Name | Duration (ms) | Start (ms) | Stop (ms) | Device" << std::endl;
for (auto event = first; event != last; ++event)
{
- std::chrono::duration<double, std::milli> startTimeMs = event->m_StartTime.time_since_epoch();
- std::chrono::duration<double, std::milli> stopTimeMs = event->m_StopTime.time_since_epoch();
-
- outStream << std::setw(50) << event->m_Label << " "
- << std::setw(20) << event->DurationMs()
- << std::setw(20) << startTimeMs.count()
- << std::setw(20) << stopTimeMs.count()
- << std::setw(20) << Profiler::Get().GetEventComputeDevice(event->m_Device)
- << std::endl;
+ const Event* eventPtr = GetEventPtr((*event));
+ double startTimeMs = FindMeasurement(WallClockTimer::WALL_CLOCK_TIME_START, eventPtr).m_Value;
+ double stopTimeMs = FindMeasurement(WallClockTimer::WALL_CLOCK_TIME_STOP, eventPtr).m_Value;
+
+ // Find the WallClock measurement if there is one.
+ double durationMs = FindMeasurement(WallClockTimer::WALL_CLOCK_TIME, eventPtr).m_Value;
+ outStream << std::setw(50) << eventPtr->GetName() << " "
+ << std::setw(20) << durationMs
+ << std::setw(20) << startTimeMs
+ << std::setw(20) << stopTimeMs
+ << std::setw(20) << GetComputeDeviceAsCString(eventPtr->GetComputeDevice())
+ << std::endl;
}
outStream << std::endl;
- // Restore previous precision settings
+ // Restores previous precision settings.
outStream.flags(oldFlags);
outStream.precision(oldPrecision);
}
- // Aggregate results per event name
+ // Aggregates results per event name.
std::map<std::string, ProfilingEventStats> nameToStatsMap = CalculateProfilingEventStats();
- // Output aggregated stats
+ // Outputs aggregated stats.
outStream << "Event Stats - Name | Avg (ms) | Min (ms) | Max (ms) | Total (ms) | Count" << std::endl;
for (const auto& pair : nameToStatsMap)
{
@@ -126,74 +169,236 @@ void Profiler::AnalyzeEventSequenceAndWriteResults(std::vector<ProfilingEvent>::
outStream << std::endl;
}
-Profiler Profiler::s_Instance;
-
Profiler::Profiler()
- : m_EventTag(0)
- , m_NestingLevel(0)
- , m_EventTagUpdated(false)
+ : m_ProfilingEnabled(false)
{
m_EventSequence.reserve(g_ProfilingEventCountHint);
#if ARMNN_STREAMLINE_ENABLED
- // Initialise streamline annotations
+ // Initialises streamline annotations.
ANNOTATE_SETUP;
#endif
}
Profiler::~Profiler()
{
- if (g_WriteReportToStdOutOnProfilerDestruction)
+ if (m_ProfilingEnabled)
{
- AnalyzeEventsAndWriteResults(std::cout);
+ if (g_WriteReportToStdOutOnProfilerDestruction)
+ {
+ Print(std::cout);
+ }
}
+
+ // Un-register this profiler from the current thread.
+ ProfilerManager::GetInstance().RegisterProfiler(nullptr);
}
-void Profiler::BeginEvent(Compute compute, const std::string label)
+bool Profiler::IsProfilingEnabled()
+{
+ return m_ProfilingEnabled;
+}
+
+void Profiler::EnableProfiling(bool enableProfiling)
+{
+ m_ProfilingEnabled = enableProfiling;
+}
+
+Event* Profiler::BeginEvent(Compute compute, const std::string& label, std::vector<InstrumentPtr>&& instruments)
{
// We need to sync just before the begin event to not include time before the period we want to time.
WaitForDevice(compute);
- const TimePoint timeStamp = Clock::now();
- m_ObservedMarkers.emplace(Marker{m_EventSequence.size(), label, timeStamp, compute, m_EventTag});
- m_EventSequence.emplace_back();
+ Event* parent = m_Parents.empty() ? nullptr : m_Parents.top();
+ m_EventSequence.push_back(std::make_unique<Event>(label, this, parent, compute, std::move(instruments)));
+ Event* event = m_EventSequence.back().get();
+ event->Start();
#if ARMNN_STREAMLINE_ENABLED
- ANNOTATE_CHANNEL_COLOR(m_NestingLevel, GetEventColor(compute), label.c_str());
+ ANNOTATE_CHANNEL_COLOR(m_Parents.size(), GetEventColor(compute), label.c_str());
#endif
- m_NestingLevel++;
+ m_Parents.push(event);
+ return event;
}
-void Profiler::EndEvent(Compute compute)
+void Profiler::EndEvent(Event* event)
{
- // We need to sync just before the end event to include all the time of the timed period.
- WaitForDevice(compute);
-
- const Marker& marker = m_ObservedMarkers.top();
+ event->Stop();
- const TimePoint startTime = marker.m_TimeStamp;
- const TimePoint stopTime = Clock::now();
+ BOOST_ASSERT(!m_Parents.empty());
+ BOOST_ASSERT(event == m_Parents.top());
+ m_Parents.pop();
- m_EventSequence[marker.m_Id] = {std::move(marker.m_EventName),
- startTime,
- stopTime,
- marker.m_ComputeDevice,
- marker.m_Tag};
-
- m_ObservedMarkers.pop();
+ Event* parent = m_Parents.empty() ? nullptr : m_Parents.top();
+ boost::ignore_unused(parent);
+ BOOST_ASSERT(event->GetParentEvent() == parent);
#if ARMNN_STREAMLINE_ENABLED
- ANNOTATE_CHANNEL_END(m_NestingLevel);
+ ANNOTATE_CHANNEL_END(m_Parents.size());
#endif
+}
+
+int CalcLevel(const Event* eventPtr)
+{
+ int level=0;
+ while (eventPtr != nullptr)
+ {
+ eventPtr = eventPtr->GetParentEvent();
+ level++;
+ }
+ return level;
+}
+
+void Profiler::PopulateInferences(std::vector<const Event*>& outInferences, int& outBaseLevel) const
+{
+ outInferences.reserve(m_EventSequence.size());
+ for (const auto& event : m_EventSequence)
+ {
+ const Event* eventPtrRaw = event.get();
+ if (eventPtrRaw->GetName() == "EnqueueWorkload")
+ {
+ outBaseLevel = (outBaseLevel == -1) ? CalcLevel(eventPtrRaw) : outBaseLevel;
+ outInferences.push_back(eventPtrRaw);
+ }
+ }
+}
+
+void Profiler::PopulateDescendants(std::map<const Event*, std::vector<const Event*>>& outDescendantsMap) const
+{
+ for (const auto& event : m_EventSequence)
+ {
+ const Event* eventPtrRaw = event.get();
+ const Event* parent = eventPtrRaw->GetParentEvent();
+
+ if (!parent)
+ {
+ continue;
+ }
+
+ auto it = outDescendantsMap.find(parent);
+ if (it == outDescendantsMap.end())
+ {
+ outDescendantsMap.emplace(parent, std::vector<const Event*>({eventPtrRaw}));
+ }
+ else
+ {
+ it->second.push_back(eventPtrRaw);
+ }
+ }
+}
+
+void Profiler::Print(std::ostream& outStream) const
+{
+ // Makes sure timestamps are output with 6 decimals, and save old settings.
+ std::streamsize oldPrecision = outStream.precision();
+ outStream.precision(6);
+ std::ios_base::fmtflags oldFlags = outStream.flags();
+ outStream.setf(std::ios::fixed);
+ JsonPrinter printer(outStream);
+
+ // First find all the "inference" Events and print out duration measurements.
+ int baseLevel = -1;
+ std::vector<const Event*> inferences;
+ PopulateInferences(inferences, baseLevel);
+
+ // Second map out descendants hierarchy
+ std::map<const Event*, std::vector<const Event*>> descendantsMap;
+ PopulateDescendants(descendantsMap);
+
+ JsonChildObject inferenceObject{"inference_measurements"};
+ JsonChildObject layerObject{"layer_measurements"};
+ std::vector<JsonChildObject> workloadObjects;
+ std::map<unsigned int, std::vector<JsonChildObject>> workloadToKernelObjects;
+
+ for (unsigned int inferenceIndex = 0; inferenceIndex < inferences.size(); ++inferenceIndex)
+ {
+ auto inference = inferences[inferenceIndex];
+ Measurement measurement = FindMeasurement(WallClockTimer::WALL_CLOCK_TIME, inference);
+ inferenceObject.SetUnit(measurement.m_Unit);
+ inferenceObject.AddMeasurement(measurement.m_Value);
+
+ auto layerEventsIt = descendantsMap.find(inference);
+
+ // Assuming 1 Execute per inference
+ if (layerEventsIt != descendantsMap.end())
+ {
+ auto layerEvent = layerEventsIt->second[0];
+ Measurement measurement = FindMeasurement(WallClockTimer::WALL_CLOCK_TIME, layerEvent);
+ layerObject.SetUnit(measurement.m_Unit);
+ layerObject.AddMeasurement(measurement.m_Value);
+
+ // Get Descendant Events for Execute
+ auto workloadEventsIt = descendantsMap.find(layerEvent);
+ for(unsigned int workloadIndex = 0; workloadIndex < workloadEventsIt->second.size(); ++workloadIndex)
+ {
+ auto workloadEvent = workloadEventsIt->second[workloadIndex];
+ Measurement measurement = FindMeasurement(WallClockTimer::WALL_CLOCK_TIME, workloadEvent);
+ std::vector<Measurement> kernelMeasurements = FindKernelMeasurements(workloadEvent);
+ if (inferenceIndex == 0)
+ {
+ // Only add second level once, in case of multiple inferences
+ JsonChildObject workloadObject{workloadEvent->GetName()};
+ workloadObject.SetUnit(measurement.m_Unit);
+ workloadObjects.push_back(workloadObject);
+ }
+ workloadObjects[workloadIndex].AddMeasurement(measurement.m_Value);
+
+ for(unsigned int kernelIndex = 0; kernelIndex < kernelMeasurements.size(); ++kernelIndex)
+ {
+ if (inferenceIndex == 0)
+ {
+ // Only add kernel measurement once, in case of multiple inferences
+ JsonChildObject kernelObject{kernelMeasurements[kernelIndex].m_Name};
+ kernelObject.SetUnit(kernelMeasurements[kernelIndex].m_Unit);
+ workloadToKernelObjects[workloadIndex].push_back(kernelObject);
+
+ }
+ workloadToKernelObjects[workloadIndex][kernelIndex].
+ AddMeasurement(kernelMeasurements[kernelIndex].m_Value);
+ }
+ }
+ }
+ }
+
+ for (auto workloadToKernelPair : workloadToKernelObjects)
+ {
+ for (auto kernelObject : workloadToKernelPair.second)
+ {
+ workloadObjects[workloadToKernelPair.first].AddChild(kernelObject);
+ }
+ }
- m_NestingLevel--;
+ for (auto workloadObject : workloadObjects)
+ {
+ layerObject.AddChild(workloadObject);
+ }
+ inferenceObject.AddChild(layerObject);
+
+ printer.PrintHeader();
+ printer.PrintArmNNHeader();
+
+ // print inference object, also prints child layer and kernel measurements
+ printer.PrintJsonChildObject(inferenceObject);
+
+ // end of ArmNN
+ printer.PrintNewLine();
+ printer.PrintFooter();
+
+ // end of main JSON object
+ printer.PrintNewLine();
+ printer.PrintFooter();
+ printer.PrintNewLine();
+
+ // Restores previous precision settings.
+ outStream.flags(oldFlags);
+ outStream.precision(oldPrecision);
}
void Profiler::AnalyzeEventsAndWriteResults(std::ostream& outStream) const
{
// Stack should be empty now.
- const bool saneMarkerSequence = m_ObservedMarkers.empty();
+ const bool saneMarkerSequence = m_Parents.empty();
// Abort if the sequence of markers was found to have incorrect information:
// The stats cannot be trusted.
@@ -206,39 +411,69 @@ void Profiler::AnalyzeEventsAndWriteResults(std::ostream& outStream) const
return;
}
- // Analyze the full sequence of events
- AnalyzeEventSequenceAndWriteResults(m_EventSequence.begin(), m_EventSequence.end(), outStream);
+ // Analyzes the full sequence of events.
+ AnalyzeEventSequenceAndWriteResults(m_EventSequence.cbegin(),
+ m_EventSequence.cend(),
+ outStream);
- // Aggregate events by tag if requested (spams the output stream if done for all tags)
- if (m_EventTagUpdated && g_AggregateProfilingEventsByTag)
+ // Aggregates events by tag if requested (spams the output stream if done for all tags).
+ if (g_AggregateProfilingEventsByInference)
{
outStream << std::endl;
outStream << "***" << std::endl;
- outStream << "*** Per Tag Stats" << std::endl;
+ outStream << "*** Per Inference Stats" << std::endl;
outStream << "***" << std::endl;
outStream << std::endl;
- for (auto iter = m_EventSequence.begin(); iter != m_EventSequence.end();)
- {
- const uint32_t tag = iter->m_Tag;
+ int baseLevel = -1;
+ std::vector<const Event*> inferences;
+ PopulateInferences(inferences, baseLevel);
- // Advance iter until we find the first non-matching tag
- auto tagEndIter = iter;
- for (; tagEndIter != m_EventSequence.end(); ++tagEndIter)
+ // Second map out descendants hierarchy
+ std::map<const Event*, std::vector<const Event*>> descendantsMap;
+ PopulateDescendants(descendantsMap);
+
+ std::function<void (const Event*, std::vector<const Event*>&)>
+ FindDescendantEvents = [&](const Event* eventPtr,
+ std::vector<const Event*>& sequence)
{
- if (tagEndIter->m_Tag != tag)
+ sequence.push_back(eventPtr);
+
+ if (CalcLevel(eventPtr) > baseLevel+2) //We only care about levels as deep as workload executions.
{
- break;
+ return;
}
- }
- outStream << "> Begin Tag: " << tag << std::endl;
+ auto children = descendantsMap.find(eventPtr);
+ if (children == descendantsMap.end())
+ {
+ return;
+ }
+
+ for (const Event* child : children->second)
+ {
+ return FindDescendantEvents(child, sequence);
+ }
+ };
+
+ // Third, find events belonging to each inference
+ int inferenceIdx = 0;
+ for (auto inference : inferences)
+ {
+ std::vector<const Event*> sequence;
+
+ //build sequence, depth first
+ FindDescendantEvents(inference, sequence);
+
+ outStream << "> Begin Inference: " << inferenceIdx << std::endl;
outStream << std::endl;
- AnalyzeEventSequenceAndWriteResults(iter, tagEndIter, outStream);
+ AnalyzeEventSequenceAndWriteResults(sequence.cbegin(),
+ sequence.cend(),
+ outStream);
outStream << std::endl;
- outStream << "> End Tag: " << tag << std::endl;
+ outStream << "> End Inference: " << inferenceIdx << std::endl;
- iter = tagEndIter;
+ inferenceIdx++;
}
}
}
@@ -253,21 +488,6 @@ void Profiler::WaitForDevice(Compute compute) const
#endif
}
-const char* Profiler::GetEventComputeDevice(Compute compute) const
-{
- switch(compute)
- {
- case Compute::CpuRef:
- return "CpuRef";
- case Compute::CpuAcc:
- return "CpuAcc";
- case Compute::GpuAcc:
- return "GpuAcc";
- default:
- return "Undefined";
- }
-}
-
std::uint32_t Profiler::GetEventColor(Compute compute) const
{
switch(compute)
@@ -287,7 +507,24 @@ std::uint32_t Profiler::GetEventColor(Compute compute) const
}
}
-} // namespace armnn
+// The thread_local pointer to the profiler instance.
+thread_local Profiler* tl_Profiler = nullptr;
+
+ProfilerManager& ProfilerManager::GetInstance()
+{
+ // Global reference to the single ProfileManager instance allowed.
+ static ProfilerManager s_ProfilerManager;
+ return s_ProfilerManager;
+}
+
+void ProfilerManager::RegisterProfiler(Profiler* profiler)
+{
+ tl_Profiler = profiler;
+}
-#endif // ARMNN_PROFILING_ENABLED
+Profiler* ProfilerManager::GetProfiler()
+{
+ return tl_Profiler;
+}
+} // namespace armnn
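
The patch above replaces the old profiler singleton with a per-thread Profiler registered through ProfilerManager. As a rough sketch of how that API is meant to be driven, the snippet below registers a profiler on the current thread, times one event, and prints the JSON report. It is illustrative only: it assumes Profiler::InstrumentPtr is an alias for std::unique_ptr<Instrument>, that WallClockTimer (whose WALL_CLOCK_TIME measurement names are used throughout the diff) lives in WallClockTimer.hpp and is default-constructible, and ProfileSomeWork itself is a hypothetical caller; none of those declarations are part of this patch.

#include "Profiling.hpp"
#include "WallClockTimer.hpp"   // assumed header for WallClockTimer (not shown in this diff)

#include <iostream>
#include <memory>
#include <utility>
#include <vector>

void ProfileSomeWork()   // hypothetical caller, for illustration only
{
    using namespace armnn;

    Profiler profiler;
    profiler.EnableProfiling(true);

    // Make this profiler the one returned by ProfilerManager::GetProfiler()
    // on the current thread (it is stored in the thread_local tl_Profiler above).
    ProfilerManager::GetInstance().RegisterProfiler(&profiler);

    // Begin a timed event with a wall-clock instrument; BeginEvent() pushes the
    // event onto the parent stack so nested events become its children.
    std::vector<Profiler::InstrumentPtr> instruments;   // alias assumed to be std::unique_ptr<Instrument>
    instruments.emplace_back(std::make_unique<WallClockTimer>());
    Event* event = profiler.BeginEvent(Compute::CpuRef, "ExampleWorkload", std::move(instruments));

    // ... the work being measured ...

    // EndEvent() stops the instruments and pops the event off the parent stack.
    profiler.EndEvent(event);

    // Emit the JSON report assembled by Profiler::Print().
    profiler.Print(std::cout);
}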
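
Because every BeginEvent() must be matched by an EndEvent() on the same thread (EndEvent() asserts against the parent stack), callers would normally wrap the pair in an RAII scope. The helper below sketches that pattern; ScopedEvent is a hypothetical name, not the scoped-event facility the real Profiling.hpp provides, and it again assumes the Profiler::InstrumentPtr alias and WallClockTimer header described above.

#include "Profiling.hpp"
#include "WallClockTimer.hpp"   // assumed header, as above

#include <memory>
#include <string>
#include <utility>
#include <vector>

// Illustrative RAII wrapper (hypothetical, not part of the patch): pairs BeginEvent()
// with EndEvent() so nested scopes build the parent/child chain consumed by Print()
// and by the per-inference aggregation in AnalyzeEventsAndWriteResults().
class ScopedEvent
{
public:
    ScopedEvent(armnn::Compute compute, const std::string& name)
        : m_Profiler(armnn::ProfilerManager::GetInstance().GetProfiler())
        , m_Event(nullptr)
    {
        // Only record when a profiler is registered on this thread and enabled.
        if (m_Profiler != nullptr && m_Profiler->IsProfilingEnabled())
        {
            std::vector<armnn::Profiler::InstrumentPtr> instruments;
            instruments.emplace_back(std::make_unique<armnn::WallClockTimer>());
            m_Event = m_Profiler->BeginEvent(compute, name, std::move(instruments));
        }
    }

    ~ScopedEvent()
    {
        if (m_Event != nullptr)
        {
            m_Profiler->EndEvent(m_Event);
        }
    }

private:
    armnn::Profiler* m_Profiler;
    armnn::Event*    m_Event;
};

// Usage sketch: ScopedEvent e(armnn::Compute::CpuAcc, "Convolution2d"); // times the enclosing scope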