// // Copyright © 2017 Arm Ltd. All rights reserved. // See LICENSE file in the project root for full license information. // #include "Profiling.hpp" #if ARMNN_PROFILING_ENABLED #if ARMNN_STREAMLINE_ENABLED #include #endif #if ARMCOMPUTECL_ENABLED #include #endif #include #include #include #include #include #include namespace armnn { // Controls the amount of memory initially allocated to store profiling events. // If chosen carefully, the profiling system will not make any additional allocations, thus minimizing its impact on // measured times. constexpr std::size_t g_ProfilingEventCountHint = 1024; // Whether profiling reports should include the sequence of events together with their timings. constexpr bool g_WriteProfilingEventSequence = true; // Whether profiling reports should also report detailed information on events grouped by tag. // This is used to group stats per inference (see usage of ARMNN_UPDATE_PROFILING_EVENT_TAG in // Runtime::EnqueueWorkload). This can spam the output stream, so use carefully (or adapt // the code to just output information for a tag of interest). constexpr bool g_AggregateProfilingEventsByTag = false; // Whether a call to Profiler::AnalyzeEventsAndWriteResults() will be made when the Profiler // singleton is destroyed. It can be convenient for local tests. constexpr bool g_WriteReportToStdOutOnProfilerDestruction = true; // Whether events denoting operations running on the GPU should force a sync before/after the event. // This is hardcoded to true for now as the profiling timings are not very useful without it. constexpr bool g_ProfilingForceGpuSync = true; std::map Profiler::CalculateProfilingEventStats() const { std::map nameToStatsMap; for (auto&& event : m_EventSequence) { auto mapIter = nameToStatsMap.find(event.m_Label); if (mapIter != nameToStatsMap.end()) { ProfilingEventStats& stats = mapIter->second; stats.m_TotalMs += event.DurationMs(); stats.m_MinMs = std::min(stats.m_MinMs, event.DurationMs()); stats.m_MaxMs = std::max(stats.m_MaxMs, event.DurationMs()); ++stats.m_Count; } else { ProfilingEventStats stats; stats.m_TotalMs = event.DurationMs(); stats.m_MinMs = event.DurationMs(); stats.m_MaxMs = event.DurationMs(); stats.m_Count = 1; nameToStatsMap[event.m_Label] = stats; } } return nameToStatsMap; } void Profiler::AnalyzeEventSequenceAndWriteResults(std::vector::const_iterator first, std::vector::const_iterator last, std::ostream& outStream) const { // Output event sequence, if needed if (g_WriteProfilingEventSequence) { // Make sure timestamps are output with 6 decimals, and save old settings std::streamsize oldPrecision = outStream.precision(); outStream.precision(6); std::ios_base::fmtflags oldFlags = outStream.flags(); outStream.setf(std::ios::fixed); // Output fields outStream << "Event Sequence - Name | Duration (ms) | Start (ms) | Stop (ms) | Device" << std::endl; for (auto event = first; event != last; ++event) { std::chrono::duration startTimeMs = event->m_StartTime.time_since_epoch(); std::chrono::duration stopTimeMs = event->m_StopTime.time_since_epoch(); outStream << std::setw(50) << event->m_Label << " " << std::setw(20) << event->DurationMs() << std::setw(20) << startTimeMs.count() << std::setw(20) << stopTimeMs.count() << std::setw(20) << Profiler::Get().GetEventComputeDevice(event->m_Device) << std::endl; } outStream << std::endl; // Restore previous precision settings outStream.flags(oldFlags); outStream.precision(oldPrecision); } // Aggregate results per event name std::map nameToStatsMap = CalculateProfilingEventStats(); // Output aggregated stats outStream << "Event Stats - Name | Avg (ms) | Min (ms) | Max (ms) | Total (ms) | Count" << std::endl; for (const auto& pair : nameToStatsMap) { const std::string& eventLabel = pair.first; const ProfilingEventStats& eventStats = pair.second; const double avgMs = eventStats.m_TotalMs / double(eventStats.m_Count); outStream << "\t" << std::setw(50) << eventLabel << " " << std::setw(9) << avgMs << " " << std::setw(9) << eventStats.m_MinMs << " " << std::setw(9) << eventStats.m_MaxMs << " " << std::setw(9) << eventStats.m_TotalMs << " " << std::setw(9) << eventStats.m_Count << std::endl; } outStream << std::endl; } Profiler Profiler::s_Instance; Profiler::Profiler() : m_EventTag(0) , m_NestingLevel(0) , m_EventTagUpdated(false) { m_EventSequence.reserve(g_ProfilingEventCountHint); #if ARMNN_STREAMLINE_ENABLED // Initialise streamline annotations ANNOTATE_SETUP; #endif } Profiler::~Profiler() { if (g_WriteReportToStdOutOnProfilerDestruction) { AnalyzeEventsAndWriteResults(std::cout); } } void Profiler::BeginEvent(Compute compute, const std::string label) { // We need to sync just before the begin event to not include time before the period we want to time. WaitForDevice(compute); const TimePoint timeStamp = Clock::now(); m_ObservedMarkers.emplace(Marker{m_EventSequence.size(), label, timeStamp, compute, m_EventTag}); m_EventSequence.emplace_back(); #if ARMNN_STREAMLINE_ENABLED ANNOTATE_CHANNEL_COLOR(m_NestingLevel, GetEventColor(compute), label.c_str()); #endif m_NestingLevel++; } void Profiler::EndEvent(Compute compute) { // We need to sync just before the end event to include all the time of the timed period. WaitForDevice(compute); const Marker& marker = m_ObservedMarkers.top(); const TimePoint startTime = marker.m_TimeStamp; const TimePoint stopTime = Clock::now(); m_EventSequence[marker.m_Id] = {std::move(marker.m_EventName), startTime, stopTime, marker.m_ComputeDevice, marker.m_Tag}; m_ObservedMarkers.pop(); #if ARMNN_STREAMLINE_ENABLED ANNOTATE_CHANNEL_END(m_NestingLevel); #endif m_NestingLevel--; } void Profiler::AnalyzeEventsAndWriteResults(std::ostream& outStream) const { // Stack should be empty now. const bool saneMarkerSequence = m_ObservedMarkers.empty(); // Abort if the sequence of markers was found to have incorrect information: // The stats cannot be trusted. if (!saneMarkerSequence) { outStream << "Cannot write profiling stats. " "Unexpected errors were found when analyzing the sequence of logged events, which may lead to plainly " "wrong stats. The profiling system may contain implementation issues or could have been used in an " "unsafe manner." << std::endl; return; } // Analyze the full sequence of events AnalyzeEventSequenceAndWriteResults(m_EventSequence.begin(), m_EventSequence.end(), outStream); // Aggregate events by tag if requested (spams the output stream if done for all tags) if (m_EventTagUpdated && g_AggregateProfilingEventsByTag) { outStream << std::endl; outStream << "***" << std::endl; outStream << "*** Per Tag Stats" << std::endl; outStream << "***" << std::endl; outStream << std::endl; for (auto iter = m_EventSequence.begin(); iter != m_EventSequence.end();) { const uint32_t tag = iter->m_Tag; // Advance iter until we find the first non-matching tag auto tagEndIter = iter; for (; tagEndIter != m_EventSequence.end(); ++tagEndIter) { if (tagEndIter->m_Tag != tag) { break; } } outStream << "> Begin Tag: " << tag << std::endl; outStream << std::endl; AnalyzeEventSequenceAndWriteResults(iter, tagEndIter, outStream); outStream << std::endl; outStream << "> End Tag: " << tag << std::endl; iter = tagEndIter; } } } void Profiler::WaitForDevice(Compute compute) const { #if ARMCOMPUTECL_ENABLED if(compute == Compute::GpuAcc && g_ProfilingForceGpuSync) { arm_compute::CLScheduler::get().sync(); } #endif } const char* Profiler::GetEventComputeDevice(Compute compute) const { switch(compute) { case Compute::CpuRef: return "CpuRef"; case Compute::CpuAcc: return "CpuAcc"; case Compute::GpuAcc: return "GpuAcc"; default: return "Undefined"; } } std::uint32_t Profiler::GetEventColor(Compute compute) const { switch(compute) { case Compute::CpuRef: // Cyan return 0xffff001b; case Compute::CpuAcc: // Green return 0x00ff001b; case Compute::GpuAcc: // Purple return 0xff007f1b; default: // Dark gray return 0x5555551b; } } } // namespace armnn #endif // ARMNN_PROFILING_ENABLED