From c577f2c6a3b4ddb6ba87a882723c53a248afbeba Mon Sep 17 00:00:00 2001 From: telsoa01 Date: Fri, 31 Aug 2018 09:22:23 +0100 Subject: Release 18.08 --- src/armnn/Profiling.cpp | 455 ++++++++++++++++++++++++++++++++++++------------ 1 file changed, 346 insertions(+), 109 deletions(-) (limited to 'src/armnn/Profiling.cpp') diff --git a/src/armnn/Profiling.cpp b/src/armnn/Profiling.cpp index 15a195e6bd..f70f6a34d1 100644 --- a/src/armnn/Profiling.cpp +++ b/src/armnn/Profiling.cpp @@ -3,8 +3,7 @@ // See LICENSE file in the project root for full license information. // #include "Profiling.hpp" - -#if ARMNN_PROFILING_ENABLED +#include "JsonPrinter.hpp" #if ARMNN_STREAMLINE_ENABLED #include @@ -17,10 +16,12 @@ #include #include #include +#include #include #include -#include +#include +#include namespace armnn { @@ -32,86 +33,128 @@ constexpr std::size_t g_ProfilingEventCountHint = 1024; // Whether profiling reports should include the sequence of events together with their timings. constexpr bool g_WriteProfilingEventSequence = true; -// Whether profiling reports should also report detailed information on events grouped by tag. -// This is used to group stats per inference (see usage of ARMNN_UPDATE_PROFILING_EVENT_TAG in -// Runtime::EnqueueWorkload). This can spam the output stream, so use carefully (or adapt -// the code to just output information for a tag of interest). -constexpr bool g_AggregateProfilingEventsByTag = false; +// Whether profiling reports should also report detailed information on events grouped by inference. +// This can spam the output stream, so use carefully (or adapt the code to just output information +// of interest). +constexpr bool g_AggregateProfilingEventsByInference = true; -// Whether a call to Profiler::AnalyzeEventsAndWriteResults() will be made when the Profiler -// singleton is destroyed. It can be convenient for local tests. -constexpr bool g_WriteReportToStdOutOnProfilerDestruction = true; +// Whether a call to Profiler::AnalyzeEventsAndWriteResults() will be made when the Profiler is destroyed. +// It can be convenient for local tests. +constexpr bool g_WriteReportToStdOutOnProfilerDestruction = false; // Whether events denoting operations running on the GPU should force a sync before/after the event. // This is hardcoded to true for now as the profiling timings are not very useful without it. +#if ARMCOMPUTECL_ENABLED constexpr bool g_ProfilingForceGpuSync = true; +#endif + +Measurement FindMeasurement(const std::string& name, const Event* event) +{ + + BOOST_ASSERT(event != nullptr); + + // Search though the measurements. + for (const auto& measurement : event->GetMeasurements()) + { + if (measurement.m_Name == name) + { + // Measurement found. + return measurement; + } + } + + // Measurement not found. + return Measurement{ "", 0.f, Measurement::Unit::TIME_MS }; +} + +std::vector FindKernelMeasurements(const Event* event) +{ + BOOST_ASSERT(event != nullptr); + + std::vector measurements; + + // Search through the measurements. + for (const auto& measurement : event->GetMeasurements()) + { + if (measurement.m_Name.rfind("OpenClKernelTimer", 0) == 0 + || measurement.m_Name.rfind("NeonKernelTimer", 0) == 0) + { + // Measurement found. + measurements.push_back(measurement); + } + } + + return measurements; +} std::map Profiler::CalculateProfilingEventStats() const { std::map nameToStatsMap; - for (auto&& event : m_EventSequence) + for (const auto& event : m_EventSequence) { - auto mapIter = nameToStatsMap.find(event.m_Label); - if (mapIter != nameToStatsMap.end()) + Measurement measurement = FindMeasurement(WallClockTimer::WALL_CLOCK_TIME, event.get()); + + double durationMs = measurement.m_Value; + auto it = nameToStatsMap.find(event->GetName()); + if (it != nameToStatsMap.end()) { - ProfilingEventStats& stats = mapIter->second; - stats.m_TotalMs += event.DurationMs(); - stats.m_MinMs = std::min(stats.m_MinMs, event.DurationMs()); - stats.m_MaxMs = std::max(stats.m_MaxMs, event.DurationMs()); + ProfilingEventStats& stats = it->second; + stats.m_TotalMs += durationMs; + stats.m_MinMs = std::min(stats.m_MinMs, durationMs); + stats.m_MaxMs = std::max(stats.m_MaxMs, durationMs); ++stats.m_Count; } else { - ProfilingEventStats stats; - stats.m_TotalMs = event.DurationMs(); - stats.m_MinMs = event.DurationMs(); - stats.m_MaxMs = event.DurationMs(); - stats.m_Count = 1; - - nameToStatsMap[event.m_Label] = stats; + nameToStatsMap.emplace(event->GetName(), ProfilingEventStats{ durationMs, durationMs, durationMs, 1 }); } } return nameToStatsMap; } -void Profiler::AnalyzeEventSequenceAndWriteResults(std::vector::const_iterator first, - std::vector::const_iterator last, - std::ostream& outStream) const +const Event* GetEventPtr(const Event* ptr) { return ptr;} +const Event* GetEventPtr(const std::unique_ptr& ptr) {return ptr.get(); } + +template +void Profiler::AnalyzeEventSequenceAndWriteResults(ItertType first, ItertType last, std::ostream& outStream) const { - // Output event sequence, if needed + // Outputs event sequence, if needed. if (g_WriteProfilingEventSequence) { - // Make sure timestamps are output with 6 decimals, and save old settings + // Makes sure timestamps are output with 6 decimals, and save old settings. std::streamsize oldPrecision = outStream.precision(); outStream.precision(6); std::ios_base::fmtflags oldFlags = outStream.flags(); outStream.setf(std::ios::fixed); - // Output fields + // Outputs fields. outStream << "Event Sequence - Name | Duration (ms) | Start (ms) | Stop (ms) | Device" << std::endl; for (auto event = first; event != last; ++event) { - std::chrono::duration startTimeMs = event->m_StartTime.time_since_epoch(); - std::chrono::duration stopTimeMs = event->m_StopTime.time_since_epoch(); - - outStream << std::setw(50) << event->m_Label << " " - << std::setw(20) << event->DurationMs() - << std::setw(20) << startTimeMs.count() - << std::setw(20) << stopTimeMs.count() - << std::setw(20) << Profiler::Get().GetEventComputeDevice(event->m_Device) - << std::endl; + const Event* eventPtr = GetEventPtr((*event)); + double startTimeMs = FindMeasurement(WallClockTimer::WALL_CLOCK_TIME_START, eventPtr).m_Value; + double stopTimeMs = FindMeasurement(WallClockTimer::WALL_CLOCK_TIME_STOP, eventPtr).m_Value; + + // Find the WallClock measurement if there is one. + double durationMs = FindMeasurement(WallClockTimer::WALL_CLOCK_TIME, eventPtr).m_Value; + outStream << std::setw(50) << eventPtr->GetName() << " " + << std::setw(20) << durationMs + << std::setw(20) << startTimeMs + << std::setw(20) << stopTimeMs + << std::setw(20) << GetComputeDeviceAsCString(eventPtr->GetComputeDevice()) + << std::endl; } outStream << std::endl; - // Restore previous precision settings + // Restores previous precision settings. outStream.flags(oldFlags); outStream.precision(oldPrecision); } - // Aggregate results per event name + // Aggregates results per event name. std::map nameToStatsMap = CalculateProfilingEventStats(); - // Output aggregated stats + // Outputs aggregated stats. outStream << "Event Stats - Name | Avg (ms) | Min (ms) | Max (ms) | Total (ms) | Count" << std::endl; for (const auto& pair : nameToStatsMap) { @@ -126,74 +169,236 @@ void Profiler::AnalyzeEventSequenceAndWriteResults(std::vector:: outStream << std::endl; } -Profiler Profiler::s_Instance; - Profiler::Profiler() - : m_EventTag(0) - , m_NestingLevel(0) - , m_EventTagUpdated(false) + : m_ProfilingEnabled(false) { m_EventSequence.reserve(g_ProfilingEventCountHint); #if ARMNN_STREAMLINE_ENABLED - // Initialise streamline annotations + // Initialises streamline annotations. ANNOTATE_SETUP; #endif } Profiler::~Profiler() { - if (g_WriteReportToStdOutOnProfilerDestruction) + if (m_ProfilingEnabled) { - AnalyzeEventsAndWriteResults(std::cout); + if (g_WriteReportToStdOutOnProfilerDestruction) + { + Print(std::cout); + } } + + // Un-register this profiler from the current thread. + ProfilerManager::GetInstance().RegisterProfiler(nullptr); } -void Profiler::BeginEvent(Compute compute, const std::string label) +bool Profiler::IsProfilingEnabled() +{ + return m_ProfilingEnabled; +} + +void Profiler::EnableProfiling(bool enableProfiling) +{ + m_ProfilingEnabled = enableProfiling; +} + +Event* Profiler::BeginEvent(Compute compute, const std::string& label, std::vector&& instruments) { // We need to sync just before the begin event to not include time before the period we want to time. WaitForDevice(compute); - const TimePoint timeStamp = Clock::now(); - m_ObservedMarkers.emplace(Marker{m_EventSequence.size(), label, timeStamp, compute, m_EventTag}); - m_EventSequence.emplace_back(); + Event* parent = m_Parents.empty() ? nullptr : m_Parents.top(); + m_EventSequence.push_back(std::make_unique(label, this, parent, compute, std::move(instruments))); + Event* event = m_EventSequence.back().get(); + event->Start(); #if ARMNN_STREAMLINE_ENABLED - ANNOTATE_CHANNEL_COLOR(m_NestingLevel, GetEventColor(compute), label.c_str()); + ANNOTATE_CHANNEL_COLOR(m_Parents.size(), GetEventColor(compute), label.c_str()); #endif - m_NestingLevel++; + m_Parents.push(event); + return event; } -void Profiler::EndEvent(Compute compute) +void Profiler::EndEvent(Event* event) { - // We need to sync just before the end event to include all the time of the timed period. - WaitForDevice(compute); - - const Marker& marker = m_ObservedMarkers.top(); + event->Stop(); - const TimePoint startTime = marker.m_TimeStamp; - const TimePoint stopTime = Clock::now(); + BOOST_ASSERT(!m_Parents.empty()); + BOOST_ASSERT(event == m_Parents.top()); + m_Parents.pop(); - m_EventSequence[marker.m_Id] = {std::move(marker.m_EventName), - startTime, - stopTime, - marker.m_ComputeDevice, - marker.m_Tag}; - - m_ObservedMarkers.pop(); + Event* parent = m_Parents.empty() ? nullptr : m_Parents.top(); + boost::ignore_unused(parent); + BOOST_ASSERT(event->GetParentEvent() == parent); #if ARMNN_STREAMLINE_ENABLED - ANNOTATE_CHANNEL_END(m_NestingLevel); + ANNOTATE_CHANNEL_END(m_Parents.size()); #endif +} + +int CalcLevel(const Event* eventPtr) +{ + int level=0; + while (eventPtr != nullptr) + { + eventPtr = eventPtr->GetParentEvent(); + level++; + } + return level; +} + +void Profiler::PopulateInferences(std::vector& outInferences, int& outBaseLevel) const +{ + outInferences.reserve(m_EventSequence.size()); + for (const auto& event : m_EventSequence) + { + const Event* eventPtrRaw = event.get(); + if (eventPtrRaw->GetName() == "EnqueueWorkload") + { + outBaseLevel = (outBaseLevel == -1) ? CalcLevel(eventPtrRaw) : outBaseLevel; + outInferences.push_back(eventPtrRaw); + } + } +} + +void Profiler::PopulateDescendants(std::map>& outDescendantsMap) const +{ + for (const auto& event : m_EventSequence) + { + const Event* eventPtrRaw = event.get(); + const Event* parent = eventPtrRaw->GetParentEvent(); + + if (!parent) + { + continue; + } + + auto it = outDescendantsMap.find(parent); + if (it == outDescendantsMap.end()) + { + outDescendantsMap.emplace(parent, std::vector({eventPtrRaw})); + } + else + { + it->second.push_back(eventPtrRaw); + } + } +} + +void Profiler::Print(std::ostream& outStream) const +{ + // Makes sure timestamps are output with 6 decimals, and save old settings. + std::streamsize oldPrecision = outStream.precision(); + outStream.precision(6); + std::ios_base::fmtflags oldFlags = outStream.flags(); + outStream.setf(std::ios::fixed); + JsonPrinter printer(outStream); + + // First find all the "inference" Events and print out duration measurements. + int baseLevel = -1; + std::vector inferences; + PopulateInferences(inferences, baseLevel); + + // Second map out descendants hierarchy + std::map> descendantsMap; + PopulateDescendants(descendantsMap); + + JsonChildObject inferenceObject{"inference_measurements"}; + JsonChildObject layerObject{"layer_measurements"}; + std::vector workloadObjects; + std::map> workloadToKernelObjects; + + for (unsigned int inferenceIndex = 0; inferenceIndex < inferences.size(); ++inferenceIndex) + { + auto inference = inferences[inferenceIndex]; + Measurement measurement = FindMeasurement(WallClockTimer::WALL_CLOCK_TIME, inference); + inferenceObject.SetUnit(measurement.m_Unit); + inferenceObject.AddMeasurement(measurement.m_Value); + + auto layerEventsIt = descendantsMap.find(inference); + + // Assuming 1 Execute per inference + if (layerEventsIt != descendantsMap.end()) + { + auto layerEvent = layerEventsIt->second[0]; + Measurement measurement = FindMeasurement(WallClockTimer::WALL_CLOCK_TIME, layerEvent); + layerObject.SetUnit(measurement.m_Unit); + layerObject.AddMeasurement(measurement.m_Value); + + // Get Descendant Events for Execute + auto workloadEventsIt = descendantsMap.find(layerEvent); + for(unsigned int workloadIndex = 0; workloadIndex < workloadEventsIt->second.size(); ++workloadIndex) + { + auto workloadEvent = workloadEventsIt->second[workloadIndex]; + Measurement measurement = FindMeasurement(WallClockTimer::WALL_CLOCK_TIME, workloadEvent); + std::vector kernelMeasurements = FindKernelMeasurements(workloadEvent); + if (inferenceIndex == 0) + { + // Only add second level once, in case of multiple inferences + JsonChildObject workloadObject{workloadEvent->GetName()}; + workloadObject.SetUnit(measurement.m_Unit); + workloadObjects.push_back(workloadObject); + } + workloadObjects[workloadIndex].AddMeasurement(measurement.m_Value); + + for(unsigned int kernelIndex = 0; kernelIndex < kernelMeasurements.size(); ++kernelIndex) + { + if (inferenceIndex == 0) + { + // Only add kernel measurement once, in case of multiple inferences + JsonChildObject kernelObject{kernelMeasurements[kernelIndex].m_Name}; + kernelObject.SetUnit(kernelMeasurements[kernelIndex].m_Unit); + workloadToKernelObjects[workloadIndex].push_back(kernelObject); + + } + workloadToKernelObjects[workloadIndex][kernelIndex]. + AddMeasurement(kernelMeasurements[kernelIndex].m_Value); + } + } + } + } + + for (auto workloadToKernelPair : workloadToKernelObjects) + { + for (auto kernelObject : workloadToKernelPair.second) + { + workloadObjects[workloadToKernelPair.first].AddChild(kernelObject); + } + } - m_NestingLevel--; + for (auto workloadObject : workloadObjects) + { + layerObject.AddChild(workloadObject); + } + inferenceObject.AddChild(layerObject); + + printer.PrintHeader(); + printer.PrintArmNNHeader(); + + // print inference object, also prints child layer and kernel measurements + printer.PrintJsonChildObject(inferenceObject); + + // end of ArmNN + printer.PrintNewLine(); + printer.PrintFooter(); + + // end of main JSON object + printer.PrintNewLine(); + printer.PrintFooter(); + printer.PrintNewLine(); + + // Restores previous precision settings. + outStream.flags(oldFlags); + outStream.precision(oldPrecision); } void Profiler::AnalyzeEventsAndWriteResults(std::ostream& outStream) const { // Stack should be empty now. - const bool saneMarkerSequence = m_ObservedMarkers.empty(); + const bool saneMarkerSequence = m_Parents.empty(); // Abort if the sequence of markers was found to have incorrect information: // The stats cannot be trusted. @@ -206,39 +411,69 @@ void Profiler::AnalyzeEventsAndWriteResults(std::ostream& outStream) const return; } - // Analyze the full sequence of events - AnalyzeEventSequenceAndWriteResults(m_EventSequence.begin(), m_EventSequence.end(), outStream); + // Analyzes the full sequence of events. + AnalyzeEventSequenceAndWriteResults(m_EventSequence.cbegin(), + m_EventSequence.cend(), + outStream); - // Aggregate events by tag if requested (spams the output stream if done for all tags) - if (m_EventTagUpdated && g_AggregateProfilingEventsByTag) + // Aggregates events by tag if requested (spams the output stream if done for all tags). + if (g_AggregateProfilingEventsByInference) { outStream << std::endl; outStream << "***" << std::endl; - outStream << "*** Per Tag Stats" << std::endl; + outStream << "*** Per Inference Stats" << std::endl; outStream << "***" << std::endl; outStream << std::endl; - for (auto iter = m_EventSequence.begin(); iter != m_EventSequence.end();) - { - const uint32_t tag = iter->m_Tag; + int baseLevel = -1; + std::vector inferences; + PopulateInferences(inferences, baseLevel); - // Advance iter until we find the first non-matching tag - auto tagEndIter = iter; - for (; tagEndIter != m_EventSequence.end(); ++tagEndIter) + // Second map out descendants hierarchy + std::map> descendantsMap; + PopulateDescendants(descendantsMap); + + std::function&)> + FindDescendantEvents = [&](const Event* eventPtr, + std::vector& sequence) { - if (tagEndIter->m_Tag != tag) + sequence.push_back(eventPtr); + + if (CalcLevel(eventPtr) > baseLevel+2) //We only care about levels as deep as workload executions. { - break; + return; } - } - outStream << "> Begin Tag: " << tag << std::endl; + auto children = descendantsMap.find(eventPtr); + if (children == descendantsMap.end()) + { + return; + } + + for (const Event* child : children->second) + { + return FindDescendantEvents(child, sequence); + } + }; + + // Third, find events belonging to each inference + int inferenceIdx = 0; + for (auto inference : inferences) + { + std::vector sequence; + + //build sequence, depth first + FindDescendantEvents(inference, sequence); + + outStream << "> Begin Inference: " << inferenceIdx << std::endl; outStream << std::endl; - AnalyzeEventSequenceAndWriteResults(iter, tagEndIter, outStream); + AnalyzeEventSequenceAndWriteResults(sequence.cbegin(), + sequence.cend(), + outStream); outStream << std::endl; - outStream << "> End Tag: " << tag << std::endl; + outStream << "> End Inference: " << inferenceIdx << std::endl; - iter = tagEndIter; + inferenceIdx++; } } } @@ -253,21 +488,6 @@ void Profiler::WaitForDevice(Compute compute) const #endif } -const char* Profiler::GetEventComputeDevice(Compute compute) const -{ - switch(compute) - { - case Compute::CpuRef: - return "CpuRef"; - case Compute::CpuAcc: - return "CpuAcc"; - case Compute::GpuAcc: - return "GpuAcc"; - default: - return "Undefined"; - } -} - std::uint32_t Profiler::GetEventColor(Compute compute) const { switch(compute) @@ -287,7 +507,24 @@ std::uint32_t Profiler::GetEventColor(Compute compute) const } } -} // namespace armnn +// The thread_local pointer to the profiler instance. +thread_local Profiler* tl_Profiler = nullptr; + +ProfilerManager& ProfilerManager::GetInstance() +{ + // Global reference to the single ProfileManager instance allowed. + static ProfilerManager s_ProfilerManager; + return s_ProfilerManager; +} + +void ProfilerManager::RegisterProfiler(Profiler* profiler) +{ + tl_Profiler = profiler; +} -#endif // ARMNN_PROFILING_ENABLED +Profiler* ProfilerManager::GetProfiler() +{ + return tl_Profiler; +} +} // namespace armnn -- cgit v1.2.1